Percentage of contribution value: Junting YANG(50%), May HAN (50%)

Introduction

Load libraries.

library(ggplot2)
library(gridExtra)
library(dplyr)
library(ggthemes)
library(numform)
library(timeDate)
library(lubridate)
library(reshape2)
library(ca)
library(tidyr)
library(ape)
library(knitr)
library(ROCR)
library(rpart)
library(rpart.plot)
library(grid)
library(gridExtra)
library(tidyverse)
library(data.table)
library(xgboost)
library(vtreat)
library(caret)
library(stats)
library(caTools)
library(lime)
library(DALEX)
library(pROC)
library(ROCR)
library(gridExtra)
library(ROCit)
library(pander)
library(class)
library(fpc)
library(e1071)
library(grDevices)

Establishing a consistent plotting theme to ensure uniformity in the appearance of all charts.

Load the main data.

youtubedata <- read.csv("Global YouTube Statistics.csv")

Main data transforming and cleaning

youtubedata[youtubedata == "nan"] <- NA
youtubedata[youtubedata == "NaN"] <- NA

month<- c(Jan = 1, Feb = 2, Mar = 3, Apr = 4, May = 5, Jun = 6,
                   Jul = 7, Aug = 8, Sep = 9, Oct = 10, Nov = 11, Dec = 12)
youtubedata$created_month <- month[youtubedata$created_month]
youtubedata$created_month <- as.numeric(youtubedata$created_month)

youtubedata$Gross.tertiary.education.enrollment....[youtubedata$Gross.tertiary.education.enrollment....> 100] <- NA

youtubedataT2 <- youtubedata

#Discovered 79 rows with all four income-related columns containing zeros. Replace these rows with median values
all_zero_rows <- which(youtubedata$lowest_monthly_earnings == 0 & 
                       youtubedata$highest_monthly_earnings == 0 & 
                       youtubedata$lowest_yearly_earnings == 0 & 
                       youtubedata$highest_yearly_earnings == 0)

median_lowest_monthly <- median(youtubedata$lowest_monthly_earnings[youtubedata$lowest_monthly_earnings != 0])
median_highest_monthly <- median(youtubedata$highest_monthly_earnings[youtubedata$highest_monthly_earnings != 0])
median_lowest_yearly <- median(youtubedata$lowest_yearly_earnings[youtubedata$lowest_yearly_earnings != 0])
median_highest_yearly <- median(youtubedata$highest_yearly_earnings[youtubedata$highest_yearly_earnings != 0])

youtubedata$lowest_monthly_earnings[all_zero_rows] <- median_lowest_monthly
youtubedata$highest_monthly_earnings[all_zero_rows] <- median_highest_monthly
youtubedata$lowest_yearly_earnings[all_zero_rows] <- median_lowest_yearly
youtubedata$highest_yearly_earnings[all_zero_rows] <- median_highest_yearly

#Discovered 5 rows with all NA values about created time. Delete these rows.
all_NA_rows <- which(is.na(youtubedata$created_year) & 
                     is.na(youtubedata$created_month) & 
                     is.na(youtubedata$created_date))
youtubedata <- youtubedata[-all_NA_rows, ]

median_views <- median(youtubedata$video_views_for_the_last_30_days, na.rm = TRUE)
youtubedata$video_views_for_the_last_30_days[is.na(youtubedata$video_views_for_the_last_30_days)] <- median_views
median_subscribers <- median(youtubedata$subscribers_for_last_30_days, na.rm = TRUE)
youtubedata$subscribers_for_last_30_days[is.na(youtubedata$subscribers_for_last_30_days)] <- median_subscribers

Part1 - Classification

1 Task 1 - average monthly earnings

1.1 Choose the response (target) variable - average monthly earnings

The selected response (target) variables is average monthly earnings.This variable does not exist in the database, so we created it using a specific method. We chose “average_monthly_earnings” as target variable to classify income levels.Due to the wide range of data with a substantial gap between the lowest and highest values, the median is used as a substitute for the mean.The value for average monthly earnings is represented by the median between the original database values of “lowest_monthly_earnings” and “highest_monthly_earnings”. Due to the right-skewed distribution of income, we selected the median as the threshold for determining high income.The labels assigned to the “average_monthly_earnings” variable are [0,138975) for low income levels and [138975, Inf) for high income levels.

youtubedata <- youtubedata %>%
  mutate(
    average_monthly_earnings = apply(select(., c("lowest_monthly_earnings", "highest_monthly_earnings")), 1, median),
    average_monthly_earnings_category = cut(
      average_monthly_earnings,
      breaks = c(0, 138975 , Inf),
      labels = c("low", "high"),
      right = FALSE
    )
  )
summary(youtubedata$average_monthly_earnings)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0   49038  138975  325318  324150 7225450
table(youtubedata$average_monthly_earnings_category)
## 
##  low high 
##  465  525

Looking at the proportions of outcome variable selected, we can see that there is a fairly equal split between the two categories.

round(prop.table(table(youtubedata$average_monthly_earnings_category)), 2)
## 
##  low high 
## 0.47 0.53

1.2 Single variable models

Preprocess data.

youtubedata1 <- youtubedata[, c("subscribers","video.views", "uploads", "channel_type", "created_year", "created_month", "created_date", "average_monthly_earnings_category")]
youtubedata1 <- youtubedata1[!is.na(youtubedata1$channel_type), ]
youtubedata1$channel_type <- as.factor(youtubedata1$channel_type)

Split the dataset into training, testing set and a validation (or calibration) set.

d <- youtubedata1
set.seed(2333)
d$rg <- runif(dim(d)[1])
dTrainAll <- subset(d, rg<=0.7)
dTest <- subset(d, rg>0.7)
vars <- setdiff(colnames(dTrainAll), c("rg","average_monthly_earnings_category"))
catVars <- vars[sapply(dTrainAll[, vars], class) %in%
c('factor', 'character')]
numericVars <- vars[sapply(dTrainAll[, vars], class) %in%
c('numeric', 'integer')]
# split dTrainAll into a training set and a validation (or calibration) set
useForCal <- rbinom(n=dim(dTrainAll)[1], size=1, prob=0.1)>0
dCal <- subset(dTrainAll, useForCal)

dTrain <- subset(dTrainAll, !useForCal)

Build univariate models.

outcome <- 'average_monthly_earnings_category'
pos <- 'high'

mkPredC <- function(outCol, varCol, appCol) {
pPos <- sum(outCol == pos) / length(outCol)
naTab <- table(as.factor(outCol[is.na(varCol)]))
pPosWna <- (naTab/sum(naTab))[pos]
vTab <- table(as.factor(outCol), varCol)

pPosWv <- (vTab[pos, ] + 1.0e-3*pPos) / (colSums(vTab) + 1.0e-3)
pred <- pPosWv[appCol]
pred[is.na(appCol)] <- pPosWna
pred[is.na(pred)] <- pPos
pred
}

mkPredN <- function(outCol, varCol, appCol) {

cuts <- unique(
quantile(varCol, probs=seq(0, 1, 0.1), na.rm=T))

varC <- cut(varCol,cuts)
appC <- cut(appCol,cuts)
mkPredC(outCol,varC,appC)
}

calcAUC <- function(predcol,outcol) {
perf <- performance(prediction(predcol,outcol == pos),'auc')
as.numeric(perf@y.values)
}

Process all variables, including categorical and numeric variables,and check their AUC scores.

for (v in catVars) {
  pi <- paste('pred', v, sep='')
  dTrain[,pi] <- mkPredC(dTrain[[outcome]], dTrain[[v]], dTrain[[v]])
  dCal[,pi] <- mkPredC(dCal[[outcome]], dCal[[v]], dCal[[v]])
  dTest[,pi] <- mkPredC(dTest[[outcome]], dTest[[v]], dTest[[v]])
  aucTrain <- roc(dTrain[[outcome]], dTrain[[pi]])$auc
  if (aucTrain >= 0.55) {
    aucCal <- roc(dCal[[outcome]], dCal[[pi]])$auc
    print(sprintf(
      "%s: trainAUC: %4.3f; calibrationAUC: %4.3f",
      pi, aucTrain, aucCal
    ))
  }
}
## Setting levels: control = low, case = high
## Setting direction: controls < cases
## Setting levels: control = low, case = high
## Setting direction: controls < cases
## [1] "predchannel_type: trainAUC: 0.625; calibrationAUC: 0.729"
for (v in numericVars) {
  pi <- paste('pred', v, sep='')
  dTrain[,pi] <- mkPredN(dTrain[[outcome]], dTrain[[v]], dTrain[[v]])
  dCal[,pi] <- mkPredN(dCal[[outcome]], dCal[[v]], dCal[[v]])
  dTest[,pi] <- mkPredN(dTest[[outcome]], dTest[[v]], dTest[[v]])
  aucTrain <- roc(dTrain[[outcome]], dTrain[[pi]])$auc
  if (aucTrain >= 0.55) {
    aucCal <- roc(dCal[[outcome]], dCal[[pi]])$auc
    print(sprintf(
      "%s: trainAUC: %4.3f; calibrationAUC: %4.3f",
      pi, aucTrain, aucCal
    ))
  }
}
## Setting levels: control = low, case = high
## Setting direction: controls < cases
## Setting levels: control = low, case = high
## Setting direction: controls < cases
## [1] "predsubscribers: trainAUC: 0.651; calibrationAUC: 0.751"
## Setting levels: control = low, case = high
## Setting direction: controls < cases
## Setting levels: control = low, case = high
## Setting direction: controls < cases
## [1] "predvideo.views: trainAUC: 0.772; calibrationAUC: 0.853"
## Setting levels: control = low, case = high
## Setting direction: controls < cases
## Setting levels: control = low, case = high
## Setting direction: controls < cases
## [1] "preduploads: trainAUC: 0.673; calibrationAUC: 0.734"
## Setting levels: control = low, case = high
## Setting direction: controls < cases
## Setting levels: control = low, case = high
## Setting direction: controls < cases
## [1] "predcreated_year: trainAUC: 0.554; calibrationAUC: 0.715"
## Setting levels: control = low, case = high
## Setting direction: controls < cases
## Setting levels: control = low, case = high
## Setting direction: controls < cases
## [1] "predcreated_month: trainAUC: 0.603; calibrationAUC: 0.768"
## Setting levels: control = low, case = high
## Setting direction: controls < cases
## Setting levels: control = low, case = high
## Setting direction: controls < cases
## [1] "predcreated_date: trainAUC: 0.559; calibrationAUC: 0.700"

It can be observed that the model’s performance on the calibration set is generally better than on the training set, which suggests the presence of overfitting tendencies. To further examine the AUC values, all independent variables are subjected to 100-fold cross-validation.

for (var in catVars) {
  aucs <- rep(0, 100)
  for (rep in 1:length(aucs)) {
    useForCalRep <- rbinom(n = nrow(dTrainAll), size = 1, prob = 0.1) > 0
    predRep <- mkPredC(dTrainAll[!useForCalRep, outcome],
                       dTrainAll[!useForCalRep, var],
                       dTrainAll[useForCalRep, var])
    aucs[rep] <- calcAUC(predRep, dTrainAll[useForCalRep, outcome])
  }
  print(sprintf("%s: mean: %4.3f; sd: %4.3f", var, mean(aucs), sd(aucs)))
}
## [1] "channel_type: mean: 0.585; sd: 0.069"
for (var in numericVars) {
aucs <- rep(0,100)
for (rep in 1:length(aucs)) {
useForCalRep <- rbinom(n=nrow(dTrainAll), size=1, prob=0.1) > 0
predRep <- mkPredN(dTrainAll[!useForCalRep, outcome],
dTrainAll[!useForCalRep, var],
dTrainAll[useForCalRep, var])
aucs[rep] <- calcAUC(predRep, dTrainAll[useForCalRep,outcome])
}
print(sprintf("%s: mean: %4.3f; sd: %4.3f", var, mean(aucs), sd(aucs)))
}
## [1] "subscribers: mean: 0.626; sd: 0.064"
## [1] "video.views: mean: 0.764; sd: 0.050"
## [1] "uploads: mean: 0.636; sd: 0.069"
## [1] "created_year: mean: 0.475; sd: 0.069"
## [1] "created_month: mean: 0.526; sd: 0.076"
## [1] "created_date: mean: 0.471; sd: 0.059"

To validate the calibration set, the results indicate that the “video.views” variable performs the best in the single-variable models, with an average AUC value of 0.754 and a standard deviation of 0.063. Compared to “subscribers,” “video.views” exhibits a higher average AUC value and a smaller standard deviation. “Subscribers” also performs relatively well, while the “created_year,” “created_month,” and “created_date” variables show poorer performance.

Examine the double density plots.

The results from the dual-density curve plot show that, when predicting probabilities based on “video.views,” a threshold of around 0.4 serves as a boundary. When the predicted probability is less than 0.4, YouTubers are more likely to have high earnings, while probabilities greater than 0.4 are associated with lower earnings. Within the predicted probability range of 0.3 to 0.5 for “subscribers,” the distinction in earnings is not significant. For “uploads,” predicted probabilities exceeding 0.5 do not exhibit a clear difference in earnings, but within the range of 0.25 to 0.5, YouTubers are more likely to have lower earnings. When the predicted probability for “subscribers” exceeds 0.3, higher earnings are more likely. Other variables related to the inception time, except for “created_month,” are generally not very significant.

fig1 <- ggplot(dCal) + geom_density(aes(x = predsubscribers, color = as.factor(average_monthly_earnings_category)))
fig2 <- ggplot(dCal) + geom_density(aes(x = predvideo.views, color = as.factor(average_monthly_earnings_category)))
fig3 <- ggplot(dCal) + geom_density(aes(x = preduploads, color = as.factor(average_monthly_earnings_category)))
fig4 <- ggplot(dCal) + geom_density(aes(x = predcreated_year, color = as.factor(average_monthly_earnings_category)))
fig5 <- ggplot(dCal) + geom_density(aes(x = predcreated_month, color = as.factor(average_monthly_earnings_category)))
fig6 <- ggplot(dCal) + geom_density(aes(x = predcreated_date, color = as.factor(average_monthly_earnings_category)))
fig7 <- ggplot(dCal) + geom_density(aes(x = predchannel_type, color = as.factor(average_monthly_earnings_category)))

heights <- rep(3, 7)  
widths <- rep(1, 1)   

grid.arrange(fig1, fig2, fig3, fig4, fig5, fig6, fig7, ncol = 1, heights = heights, widths = widths)

Check the ROC curve.

plot_roc <- function(predcol, outcol, colour_id=2, overlaid=F) {
ROCit_obj <- rocit(score=predcol, class=outcol==pos)
par(new=overlaid)
plot(ROCit_obj, col = c(colour_id, 1),
legend = FALSE, YIndex = FALSE, values = FALSE)
}

plot_roc(dTest$predsubscribers, dTest[, outcome], colour_id = 2) #red
plot_roc(dTest$predvideo.views, dTest[, outcome], colour_id = 3, overlaid = TRUE)#green
plot_roc(dTest$preduploads, dTest[, outcome], colour_id = 4, overlaid = TRUE)#blue
plot_roc(dTest$predcreated_month, dTest[, outcome], colour_id = 5, overlaid = TRUE)#Light blue
plot_roc(dTest$predchannel_type, dTest[, outcome], colour_id = 6, overlaid = TRUE)#Magenta

Based on the area under the curve, it can be observed that ‘video.views’ performs best.

1.3 Null model and the best univariate model.

Create a null model.

Npos <- sum(dTrain[,outcome] == pos)
pred.Null <- Npos / nrow(dTrain)
cat("Proportion of outcome == 1 in dTrain:", pred.Null)
## Proportion of outcome == 1 in dTrain: 0.5439469
TP <- 0; TN <- sum(dCal[,outcome] == -1); 
FP <- 0; FN <- sum(dCal[,outcome] == 1); 
cat("nrow(dCal):", nrow(dCal), "TP:", TP, "TN:", TN, "FP:", FP, "FN:", FN)
## nrow(dCal): 61 TP: 0 TN: 0 FP: 0 FN: 0
(accuracy <- (TP + TN) / nrow(dCal))
## [1] 0
(precision <- TP/(TP + FP))
## [1] NaN
(recall <- TP/(TP + FN))
## [1] NaN
pred.Null <- rep(pred.Null, nrow(dCal))
(AUC <- calcAUC(pred.Null, dCal[,outcome]))
## [1] 0.5

Use log likelihood to select the best univariate model.

pos <- 'high'
# Define a function to compute log likelihood so that we can reuse it.
logLikelihood <- function(ytrue, ypred, epsilon=1e-6) {
sum(ifelse(ytrue==pos, log(ypred+epsilon), log(1-ypred-epsilon)), na.rm=T)
}
# Compute the likelihood of the Null model on the calibration
# set (for the KDD dataset from previous lecture)
outcome <- 'average_monthly_earnings_category'
logNull <- logLikelihood(
dCal[,outcome], sum(dCal[,outcome]==pos)/nrow(dCal)
)
cat(logNull)
## -42.20818
selCatVars <- c()
minDrop <- 10 # may need to adjust this number
for (v in catVars) {
pi <- paste('pred', v, sep='')
devDrop <- 2*(logLikelihood(dCal[,outcome], dCal[,pi]) - logNull)
if (devDrop >= minDrop) {
print(sprintf("%s, deviance reduction: %g", pi, devDrop))
selCatVars <- c(selCatVars, pi)
}
}
## [1] "predchannel_type, deviance reduction: 16.2203"
selNumVars <- c()
minDrop <- 10 # may need to adjust this number
for (v in numericVars) {
pi <- paste('pred', v, sep='')
devDrop <- 2*(logLikelihood(dCal[,outcome], dCal[,pi]) - logNull)
if (devDrop >= minDrop) {
print(sprintf("%s, deviance reduction: %g", pi, devDrop))
selNumVars <- c(selNumVars, pi)
}
}
## [1] "predsubscribers, deviance reduction: 16.4822"
## Warning in log(1 - ypred - epsilon): NaNs produced
## [1] "predvideo.views, deviance reduction: 29.3237"
## [1] "preduploads, deviance reduction: 14.6795"
## [1] "predcreated_year, deviance reduction: 10.3"
## [1] "predcreated_month, deviance reduction: 14.9858"

It’s evident that the “video.views” model is the best univariate model.

1.4 Multivariate Model - Feature Selection.

Forming a set of “candidate” columns for predictors along with the target column. Based on extensive literature, we can make hypotheses to assess potential independent variables and use Chi-square Test to test if these independent variables(“video.views”,“channel_type”,“uploads”,“subscribers” and “created_month”) can impact the classification of the dependent variable. Select the appropriate feature selection method based on the different data types of the independent variables.

# Chi-square Test
x<- youtubedata1[, c("channel_type")]
y <- youtubedata1$average_monthly_earnings_category
chi_sq<- chisq.test(x, y)
p_value <- 1 - pchisq(chi_sq$statistic, df = chi_sq$parameter)
p_value
##    X-squared 
## 2.052656e-07
video_views <- youtubedata1[["video.views"]]
uploads <- youtubedata1[["uploads"]]
subscribers <- youtubedata1[["subscribers"]]
earnings_category <- youtubedata1[["average_monthly_earnings_category"]]

# T-test 
t_test_result1 <- t.test(video_views ~ earnings_category)
t_test_result2 <- t.test(uploads ~ earnings_category)
t_test_result3 <- t.test(subscribers ~ earnings_category)
print(t_test_result1)
## 
##  Welch Two Sample t-test
## 
## data:  video_views by earnings_category
## t = -10.397, df = 663.62, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group low and group high is not equal to 0
## 95 percent confidence interval:
##  -10416062115  -7106830720
## sample estimates:
##  mean in group low mean in group high 
##         6518629286        15280075704
print(t_test_result2)
## 
##  Welch Two Sample t-test
## 
## data:  uploads by earnings_category
## t = -4.776, df = 672.97, p-value = 2.196e-06
## alternative hypothesis: true difference in means between group low and group high is not equal to 0
## 95 percent confidence interval:
##  -14335.974  -5982.593
## sample estimates:
##  mean in group low mean in group high 
##           4132.587          14291.870
print(t_test_result3)
## 
##  Welch Two Sample t-test
## 
## data:  subscribers by earnings_category
## t = -6.0653, df = 854.71, p-value = 1.976e-09
## alternative hypothesis: true difference in means between group low and group high is not equal to 0
## 95 percent confidence interval:
##  -8806150 -4500196
## sample estimates:
##  mean in group low mean in group high 
##           19522418           26175591
# ANOVA
created_month <- youtubedata1[["created_month"]]
earnings_category <- youtubedata1[["average_monthly_earnings_category"]]
anova_model <- aov(created_month ~ earnings_category)
anova_result <- summary(anova_model)
print(anova_result)
##                    Df Sum Sq Mean Sq F value Pr(>F)
## earnings_category   1      1   1.002   0.084  0.772
## Residuals         961  11509  11.976

The results of feature analysis indicate that ‘channel_type,’ ‘video.views,’ ‘uploads,’ and ‘subscribers’ significantly influence the level of average monthly earnings, while ‘created_month’ does not have a significant impact on average monthly earnings. Therefore, ‘created_month’ is excluded as a feature.

1.5 Multivariate Model - Decision tree classifier

Create a multivariate model and examine the statistical analysis results.We will start by building a model using the best-performing “video.views” ,“subscribers” and “channel_type” variables.

Split the data and create a multivariate module

Split the dataset into training and testing sets using the sample.split() function. Create a multivariate model and examine the statistical analysis results.We will start by building a model using the best-performing “video.views” and “channel_type” variables.

set.seed(123)
split <- sample.split(youtubedata1$average_monthly_earnings_category, SplitRatio = 0.7)
training_data1 <- youtubedata1[split, ]
testing_data1 <- youtubedata1[!split, ]


# Compare the levels of the 'channel_type' variable between the training and testing data.
train_levels <- levels(training_data1$channel_type)
test_levels <- levels(testing_data1$channel_type)
if(identical(train_levels, test_levels)) {
  print("The 'channel_type' variable in both the training and testing data contains the same levels.")
} else {
  print("The 'channel_type' variable in both the training and testing data contains the same levels.")
}
## [1] "The 'channel_type' variable in both the training and testing data contains the same levels."
channel_type_data <- youtubedata1[,c("channel_type","video.views","average_monthly_earnings_category")]

tree_model <- rpart(average_monthly_earnings_category ~ ., data = channel_type_data)
summary(tree_model)
## Call:
## rpart(formula = average_monthly_earnings_category ~ ., data = channel_type_data)
##   n= 963 
## 
##          CP nsplit rel error   xerror       xstd
## 1 0.4593407      0 1.0000000 1.000000 0.03404968
## 2 0.0100000      1 0.5406593 0.556044 0.03001683
## 
## Variable importance
##  video.views channel_type 
##           87           13 
## 
## Node number 1: 963 observations,    complexity param=0.4593407
##   predicted class=high  expected loss=0.4724818  P(node) =1
##     class counts:   455   508
##    probabilities: 0.472 0.528 
##   left son=2 (433 obs) right son=3 (530 obs)
##   Primary splits:
##       video.views  < 7041031000 to the left,  improve=113.74000, (0 missing)
##       channel_type splits as  RLLRLRLLRRLLLL, improve= 17.76451, (0 missing)
##   Surrogate splits:
##       channel_type splits as  RLRRRRLLRRRLRL, agree=0.617, adj=0.148, (0 split)
## 
## Node number 2: 433 observations
##   predicted class=low   expected loss=0.2586605  P(node) =0.4496366
##     class counts:   321   112
##    probabilities: 0.741 0.259 
## 
## Node number 3: 530 observations
##   predicted class=high  expected loss=0.2528302  P(node) =0.5503634
##     class counts:   134   396
##    probabilities: 0.253 0.747

According to the summary information of the decision tree model, the model’s complexity parameter (CP) is 0.4593, indicating a moderate level of model complexity. The root node predicts the category as ‘high’ and splits based on the features of the observations. The model forms two child nodes, with the left child node tending to classify observations as ‘low,’ while the right child node leans towards ‘high.’ Particularly, ‘video.views’ plays the most significant role in the model’s classification decisions with an importance value of 87, followed by ‘channel_type’ (13). These results provide insights into how the model operates and the importance of different features, serving as a starting point for further analysis.

Evaluatethe multivariate module

Evaluate the various metrics of this model.

predictions <- predict(tree_model, testing_data1, type = "class")
length(predictions)
## [1] 289
accuracy <- mean(predictions == testing_data1$average_monthly_earnings_category)
cat("Accuracy: ", accuracy, "\n")
## Accuracy:  0.7404844
confusion_matrix <- table(predictions, testing_data1$average_monthly_earnings_category)
print("Confusion Matrix:")
## [1] "Confusion Matrix:"
print(confusion_matrix)
##            
## predictions low high
##        low  101   39
##        high  36  113
TP <- confusion_matrix[2, 2]
FP <- confusion_matrix[1, 2]
FN <- confusion_matrix[2, 1]

precision <- TP / (TP + FP)
recall <- TP / (TP + FN)
f1_score <- 2 * (precision * recall) / (precision + recall)

cat("Precision: ", precision, "\n")
## Precision:  0.7434211
cat("Recall: ", recall, "\n")
## Recall:  0.7583893
cat("F1 Score: ", f1_score, "\n")
## F1 Score:  0.7508306
predictions <- predict(tree_model, testing_data1, type = "vector")

true_labels <- ifelse(testing_data1$average_monthly_earnings_category == "high", 1, 0)

pred_obj <- prediction(predictions, true_labels)

perf <- performance(pred_obj, "tpr", "fpr")

# Calculate AUC
auc_value <- performance(pred_obj, "auc")
cat("AUC Value: ", as.numeric(auc_value@y.values), "\n")
## AUC Value:  0.7403237

In terms of distinguishing between high and low categories, all values related to model evaluation are greater than 0.74. This indicates that the model performs well with high accuracy and the ability to effectively capture true positives while maintaining a high level of precision. The model can be used for classifying instances into high and low categories and demonstrates relatively good performance.

Draw ROC Curve

The results show that the ROC curve indicates a good model fit.

predictions <- predict(tree_model, testing_data1, type = "vector")

true_labels <- ifelse(testing_data1$average_monthly_earnings_category == "high", 1, 0)

pred_obj <- prediction(predictions, true_labels)

# Create a performance object to calculate ROC
perf <- performance(pred_obj, "tpr", "fpr")

# Plot the ROC curve
plot(perf, colorize = TRUE, print.cutoffs.at = seq(0, 1, 0.1), text.adj = c(-0.2, 1.7))

legend("bottomright", legend = c("Model"), col = 1, lwd = 2)

Create a double density plot.

The results show that below a probability of 0.5, it is more likely to predict low income, while above 0.5 in probability, it is more likely to predict high income.

normalized_predictions <- (predictions - min(predictions)) / (max(predictions) - min(predictions))

roc_data <- data.frame(Predicted = normalized_predictions, Actual = true_labels)

# Normalize predictions to the range [0, 1]
normalized_predictions <- (predictions - min(predictions)) / (max(predictions) - min(predictions))

roc_data$Predicted <- as.numeric(normalized_predictions)

fig1 <- ggplot(roc_data) +
  geom_density(aes(x = Predicted, fill = factor(Actual), color = factor(Actual)), alpha = 0.5) +
  xlab("Predicted Probabilities") +
  ylab("Density") 
  

# Plot the density curves with normalized predictions
print(fig1)

Add the variable “subscribers”

Incorporating “subscribers” to see if we can still construct an effective model.

channel_type_data_n <- youtubedata1[,c("channel_type","video.views","subscribers","average_monthly_earnings_category")]

tree_model_n <- rpart(average_monthly_earnings_category ~ ., data = channel_type_data_n)
summary(tree_model_n)
## Call:
## rpart(formula = average_monthly_earnings_category ~ ., data = channel_type_data_n)
##   n= 963 
## 
##          CP nsplit rel error    xerror       xstd
## 1 0.4593407      0 1.0000000 1.0000000 0.03404968
## 2 0.0100000      1 0.5406593 0.5450549 0.02982321
## 
## Variable importance
##  video.views  subscribers channel_type 
##           68           22           10 
## 
## Node number 1: 963 observations,    complexity param=0.4593407
##   predicted class=high  expected loss=0.4724818  P(node) =1
##     class counts:   455   508
##    probabilities: 0.472 0.528 
##   left son=2 (433 obs) right son=3 (530 obs)
##   Primary splits:
##       video.views  < 7041031000 to the left,  improve=113.74000, (0 missing)
##       subscribers  < 21150000   to the left,  improve= 36.33391, (0 missing)
##       channel_type splits as  RLLRLRLLRRLLLL, improve= 17.76451, (0 missing)
##   Surrogate splits:
##       subscribers  < 16050000   to the left,  agree=0.698, adj=0.328, (0 split)
##       channel_type splits as  RLRRRRLLRRRLRL, agree=0.617, adj=0.148, (0 split)
## 
## Node number 2: 433 observations
##   predicted class=low   expected loss=0.2586605  P(node) =0.4496366
##     class counts:   321   112
##    probabilities: 0.741 0.259 
## 
## Node number 3: 530 observations
##   predicted class=high  expected loss=0.2528302  P(node) =0.5503634
##     class counts:   134   396
##    probabilities: 0.253 0.747

the model’s complexity parameter (CP) is 0.4593. The values are the same as in the previous bivariate model, indicating a moderate level of model complexity. ‘video.views’ plays the most significant role in the model’s classification decisions with an importance value of 68, followed by ‘subscribers’ (22) and ‘channel_type’ (13).

Evaluate the various metrics of this model.

predictions_n <- predict(tree_model_n, testing_data1, type = "class")
length(predictions_n)
## [1] 289
accuracy <- mean(predictions_n == testing_data1$average_monthly_earnings_category)
cat("Accuracy: ", accuracy, "\n")
## Accuracy:  0.7404844
confusion_matrix <- table(predictions_n, testing_data1$average_monthly_earnings_category)
print("Confusion Matrix:")
## [1] "Confusion Matrix:"
print(confusion_matrix)
##              
## predictions_n low high
##          low  101   39
##          high  36  113
# Calculate precision, recall, and F1 score
TP <- confusion_matrix[2, 2]
FP <- confusion_matrix[1, 2]
FN <- confusion_matrix[2, 1]

precision <- TP / (TP + FP)
recall <- TP / (TP + FN)
f1_score <- 2 * (precision * recall) / (precision + recall)

cat("Precision: ", precision, "\n")
## Precision:  0.7434211
cat("Recall: ", recall, "\n")
## Recall:  0.7583893
cat("F1 Score: ", f1_score, "\n")
## F1 Score:  0.7508306
predictions <- predict(tree_model_n, testing_data1, type = "vector")

true_labels <- ifelse(testing_data1$average_monthly_earnings_category == "high", 1, 0)

pred_obj <- prediction(predictions, true_labels)

# Create a performance object to calculate ROC
perf <- performance(pred_obj, "tpr", "fpr")
# Calculate AUC
auc_value <- performance(pred_obj, "auc")
cat("AUC Value: ", as.numeric(auc_value@y.values), "\n")
## AUC Value:  0.7403237

The consistent results across different values suggest that the relationship between the “subscribers” variable and the target variable is not sufficiently strong to significantly improve the model’s performance.

1.6 Multivariate Model - K-Nearest Neighbours

Continue to use “video.views” ,“uploads” and “channel_type” as predictor variables.

k <- 15

features <- c("subscribers", "video.views","uploads")

# Train the k-NN model
knn_model <- knn(train = training_data1[, features], test = testing_data1[, features], cl = training_data1$average_monthly_earnings_category, k = k)

accuracy <- mean(knn_model == testing_data1$average_monthly_earnings_category)
cat("Accuracy: ", accuracy, "\n")
## Accuracy:  0.733564
confusion_matrix <- table(knn_model, testing_data1$average_monthly_earnings_category)
print("Confusion Matrix:")
## [1] "Confusion Matrix:"
print(confusion_matrix)
##          
## knn_model low high
##      low  102   42
##      high  35  110
TP <- confusion_matrix[2, 2]
FP <- confusion_matrix[1, 2]
FN <- confusion_matrix[2, 1]

precision <- TP / (TP + FP)
recall <- TP / (TP + FN)
f1_score <- 2 * (precision * recall) / (precision + recall)

cat("Precision: ", precision, "\n")
## Precision:  0.7236842
cat("Recall: ", recall, "\n")
## Recall:  0.7586207
cat("F1 Score: ", f1_score, "\n")
## F1 Score:  0.7407407
predictions <- as.numeric(knn_model == "high")  # Assuming "high" is the positive class
true_labels <- as.numeric(testing_data1$average_monthly_earnings_category == "high")

pred_obj <- prediction(predictions, true_labels)

perf <- performance(pred_obj, "tpr", "fpr")

auc_value <- performance(pred_obj, "auc")
cat("AUC Value: ", auc_value@y.values[[1]], "\n")
## AUC Value:  0.7341049

In summary, the K-Nearest Neighbors (KNN) model performs well in the classification task, demonstrating high accuracy and reasonable precision and recall.

Draw the ROC Curve

The results show that the ROC curve indicates a good model fit.

pred_obj <- prediction(predictions, true_labels)

perf <- performance(pred_obj, "tpr", "fpr")

plot(perf, colorize = TRUE)

legend("bottomright", legend = c("Model"), col = 1, lwd = 2)

Plot the density curves

fig1 <- ggplot(roc_data) +
  geom_density(aes(x = Predicted, fill = factor(Actual), color = factor(Actual)), alpha = 0.5) +
  xlab("Predicted Probabilities") +
  ylab("Density") +
  scale_fill_manual(values = c("blue", "red")) +
  scale_color_manual(values = c("blue", "red")) +
  theme_minimal()

# Plot the density curves
print(fig1)

The results displayed in this graph are similar to the results of the decision tree dual-density plot. The results show that below a probability of 0.5, it is more likely to predict low income, while above 0.5 in probability, it is more likely to predict high income.

1.7 Model Comparison and Evaluation.

Based on the comparison of accuracy value, prediction value, recall value, F1 score, and AUC value, it can be observed that the decision tree model fits better with respective values of 0.740, 0.743, 0.758, 0.750, and 0.740. In contrast, the K-Nearest Neighbors model has values of 0.734, 0.724, 0.759, 0.740, and 0.734, slightly lower than the decision tree. However, overall, both models exhibit similar fitting performance, and both perform well.

1.8 Using LIME to explain XGBoost model

Create a XGBoost model and check the values related to model evaluation and use LIME to explain the model.

youtubedata1$channel_type <- as.factor(youtubedata1$channel_type)
youtubedata1$channel_type <- as.numeric(youtubedata1$channel_type)
# Set seed for reproducibility
set.seed(123)

split <- sample.split(youtubedata1$average_monthly_earnings_category, SplitRatio = 0.7)
training_data1 <- youtubedata1[split, ]
testing_data1 <- youtubedata1[!split, ]

features <- c("channel_type", "video.views",'subscribers')
target_variable <- "average_monthly_earnings_category"

train_matrix <- as.matrix(training_data1[, features])
test_matrix <- as.matrix(testing_data1[, features])

label_vector <- ifelse(training_data1$average_monthly_earnings_category == "high", 1, 0)

cat("Data type of variable_matrix: ", class(train_matrix), "\n")
## Data type of variable_matrix:  matrix array
cat("Data type of labelvec: ", class(label_vector), "\n")
## Data type of labelvec:  numeric
fit_iris_example = function(variable_matrix, labelvec) {
  cv <- xgb.cv(variable_matrix, label = labelvec,
               params=list(
                 objective="binary:logistic"
               ),
               nfold=5,
               nrounds=100,
               print_every_n=10,
               metrics="logloss")

  evalframe <- as.data.frame(cv$evaluation_log)
  NROUNDS <- which.min(evalframe$test_logloss_mean)
  
  model <- xgboost(data=variable_matrix, label=labelvec,
                   params=list(
                     objective="binary:logistic"
                   ),
                   nrounds=NROUNDS,
                   verbose=FALSE)

  model
}
model <- fit_iris_example(train_matrix, label_vector)
## [1]  train-logloss:0.597899+0.003012 test-logloss:0.634807+0.009348 
## [11] train-logloss:0.355487+0.016714 test-logloss:0.611853+0.052903 
## [21] train-logloss:0.299579+0.012043 test-logloss:0.643771+0.065567 
## [31] train-logloss:0.259564+0.015731 test-logloss:0.671163+0.083254 
## [41] train-logloss:0.225813+0.012342 test-logloss:0.691175+0.094234 
## [51] train-logloss:0.199241+0.013845 test-logloss:0.716176+0.097553 
## [61] train-logloss:0.179809+0.009452 test-logloss:0.735592+0.099557 
## [71] train-logloss:0.168270+0.008377 test-logloss:0.748314+0.104175 
## [81] train-logloss:0.155626+0.008458 test-logloss:0.768456+0.107509 
## [91] train-logloss:0.139870+0.007076 test-logloss:0.783039+0.112420 
## [100]    train-logloss:0.128259+0.005499 test-logloss:0.800747+0.112117
predictions <- predict(model, test_matrix)

accuracy <- mean(predictions > 0.5)  # Assuming threshold of 0.5
cat("Accuracy: ", accuracy, "\n")
## Accuracy:  0.5432526
confusion_matrix <- table(predictions = (predictions > 0.5), actual = testing_data1$average_monthly_earnings_category == "high")

confusion_matrix <- confusionMatrix(confusion_matrix)
precision <- confusion_matrix$byClass["Pos Pred Value"]
recall <- confusion_matrix$byClass["Sensitivity"]
f1_score <- confusion_matrix$byClass["F1"]

cat("Precision: ", precision, "\n")
## Precision:  0.719697
cat("Recall: ", recall, "\n")
## Recall:  0.6934307
cat("F1 Score: ", f1_score, "\n")
## F1 Score:  0.7063197
labels <- testing_data1$average_monthly_earnings_category == "high"  # Replace with the correct label condition
predictions_obj <- prediction(predictions, labels)

perf <- performance(predictions_obj, "tpr", "fpr")

auc_value <- performance(predictions_obj, "auc")
cat("AUC Value: ", as.numeric(auc_value@y.values), "\n")
## AUC Value:  0.7434691
explainer <- lime(training_data1[features], model = model, 
                  bin_continuous = TRUE, n_bins = 10)

cases <- c(3,11,21,30)
(example <- testing_data1[cases,features])
##    channel_type  video.views subscribers
## 5             5 148000000000   159000000
## 28            9  29533230328    61000000
## 56            5   7828610828    44700000
## 93            5  18208196857    37600000
explanation <- lime::explain(example, explainer, n_labels = 1, n_features = 4)
plot_features(explanation)

LIME has generated a series of plots, among which the feature weight plot shows that “video.views” has the highest weight, contributing the most to the model’s predictions, followed by “subscribers” and “channel_type”. The case probability plot reveals that when the case is 28, the probability is highest, reaching 0.88. However, as the case number increases beyond 28, the probability decreases. Support plots and contrast plots demonstrate that only the “channel_type” variable exhibits a contrasting effect at the case of 28, while the others provide support.

2 Task 2 - Unemployment.rate

2.1 Choose the response (target) variable - Unemployment.rate

The selected response (target) variables is unemployment rate . The feature variables are “Abbreviation”,“Urban_population”,“Population”,“Gross.tertiary.education.enrollment….”

2.2 Data Processing and Preparation

apply(is.na(youtubedataT2), 2, sum)
##                                    rank                                Youtuber 
##                                       0                                       0 
##                             subscribers                             video.views 
##                                       0                                       0 
##                                category                                   Title 
##                                      46                                       0 
##                                 uploads                                 Country 
##                                       0                                     122 
##                            Abbreviation                            channel_type 
##                                     122                                      30 
##                        video_views_rank                            country_rank 
##                                       1                                     116 
##                       channel_type_rank        video_views_for_the_last_30_days 
##                                      33                                      56 
##                 lowest_monthly_earnings                highest_monthly_earnings 
##                                       0                                       0 
##                  lowest_yearly_earnings                 highest_yearly_earnings 
##                                       0                                       0 
##            subscribers_for_last_30_days                            created_year 
##                                     337                                       5 
##                           created_month                            created_date 
##                                       5                                       5 
## Gross.tertiary.education.enrollment....                              Population 
##                                     132                                     123 
##                       Unemployment.rate                        Urban_population 
##                                     123                                     123 
##                                Latitude                               Longitude 
##                                     123                                     123

Based on this result, it was found that both the country and abbreviation columns have 122 missing values. To check if it’s possible to combine the country and its abbreviation, let’s examine if the locations of missing values in these two columns are the same.

count_suspected_na_rows <- sum(rowSums(is.na(youtubedataT2[, c("Country", "Abbreviation")])) == 2)
print(count_suspected_na_rows)
## [1] 122

As obtained in the previous analysis, the missing values in the columns of Population, Unemployment Rate, and Urban Population are also the same. To verify if these missing values correspond to the same rows, it helps determine whether they can be considered as redundant data and be directly removed.

count_suspected_na_rows <- sum(rowSums(is.na(youtubedataT2[, c("Population", "Unemployment.rate", "Urban_population")])) == 3)
print(count_suspected_na_rows)
## [1] 123

The results show that the missing positions are the same and can be deleted together.

Examine the relationship between country and the missing values in the unemployment rate.

unemployment_na_rows <- which(is.na(youtubedataT2$Unemployment.rate))
country_na_rows <- which(is.na(youtubedataT2$Country))
common_na_rows <- intersect(unemployment_na_rows, country_na_rows)
different_na_rows <- setdiff(union(unemployment_na_rows, country_na_rows), common_na_rows)
cat("Same rows with NA values:", length(common_na_rows), "\n")
## Same rows with NA values: 122
cat("Same rows with NA values:", length(different_na_rows), "\n")
## Same rows with NA values: 1

Here it was found that one row is different; let’s check the situation for this particular row.

different_row_number <- different_na_rows[1] 
different_row_data <- youtubedataT2[different_row_number, c("Country", "Population", "Unemployment.rate", "Urban_population")]
print(different_row_data)
##     Country Population Unemployment.rate Urban_population
## 664 Andorra         NA                NA               NA

This row is invalid. Therefore, we will construct a new data frame by removing all rows with NA values in the Unemployment Rate column.

datanew <- youtubedataT2 %>%
  select(Country, Population, Gross.tertiary.education.enrollment...., Unemployment.rate, Urban_population) %>%
  filter(!is.na(Unemployment.rate))

summary(datanew)
##    Country            Population        Gross.tertiary.education.enrollment....
##  Length:872         Min.   :2.025e+05   Min.   : 7.60                          
##  Class :character   1st Qu.:8.336e+07   1st Qu.:36.30                          
##  Mode  :character   Median :3.282e+08   Median :68.00                          
##                     Mean   :4.304e+08   Mean   :63.11                          
##                     3rd Qu.:3.282e+08   3rd Qu.:88.20                          
##                     Max.   :1.398e+09   Max.   :94.30                          
##                                         NA's   :9                              
##  Unemployment.rate Urban_population   
##  Min.   : 0.750    Min.   :    35588  
##  1st Qu.: 5.270    1st Qu.: 55908316  
##  Median : 9.365    Median :270663028  
##  Mean   : 9.279    Mean   :224214982  
##  3rd Qu.:14.700    3rd Qu.:270663028  
##  Max.   :14.720    Max.   :842933962  
## 

We have identified 9 observations with missing values in the Education Rate column. Let’s print them:

na_rows <- datanew[is.na(datanew$Gross.tertiary.education.enrollment....), ]
num_na_rows <- which(is.na(datanew$Gross.tertiary.education.enrollment....))
print(na_rows)
##       Country Population Gross.tertiary.education.enrollment....
## 162 Australia   25766605                                      NA
## 208 Australia   25766605                                      NA
## 319 Australia   25766605                                      NA
## 407 Australia   25766605                                      NA
## 434 Australia   25766605                                      NA
## 447 Australia   25766605                                      NA
## 450 Australia   25766605                                      NA
## 611 Australia   25766605                                      NA
## 682 Australia   25766605                                      NA
##     Unemployment.rate Urban_population
## 162              5.27         21844756
## 208              5.27         21844756
## 319              5.27         21844756
## 407              5.27         21844756
## 434              5.27         21844756
## 447              5.27         21844756
## 450              5.27         21844756
## 611              5.27         21844756
## 682              5.27         21844756

All of them are from Australia, which we can consider as systematic missing values. We will fill them with the median value.

median_GTEE <- median(datanew$Gross.tertiary.education.enrollment...., na.rm = TRUE)

datanew$Gross.tertiary.education.enrollment....[is.na(datanew$Gross.tertiary.education.enrollment....)] <- median_GTEE

Although there are many data points in the table, for the same country, the values of tertiary_education, population, and urban_population are the same. We will create a new data frame that includes only unique rows.

data <- unique(datanew)
summary(data)
##    Country            Population        Gross.tertiary.education.enrollment....
##  Length:48          Min.   :2.025e+05   Min.   : 7.60                          
##  Class :character   1st Qu.:1.583e+07   1st Qu.:35.80                          
##  Mode  :character   Median :4.185e+07   Median :57.45                          
##                     Mean   :1.200e+08   Mean   :55.17                          
##                     3rd Qu.:9.744e+07   3rd Qu.:72.85                          
##                     Max.   :1.398e+09   Max.   :94.30                          
##  Unemployment.rate Urban_population   
##  Min.   : 0.750    Min.   :    35588  
##  1st Qu.: 3.743    1st Qu.:  9651217  
##  Median : 5.315    Median : 30732090  
##  Mean   : 6.507    Mean   : 69730921  
##  3rd Qu.: 9.193    3rd Qu.: 57178091  
##  Max.   :14.720    Max.   :842933962
str(data)
## 'data.frame':    48 obs. of  5 variables:
##  $ Country                                : chr  "India" "United States" "Japan" "Russia" ...
##  $ Population                             : num  1.37e+09 3.28e+08 1.26e+08 1.44e+08 5.17e+07 ...
##  $ Gross.tertiary.education.enrollment....: num  28.1 88.2 63.2 81.9 94.3 60 68.9 51.3 90 88.5 ...
##  $ Unemployment.rate                      : num  5.36 14.7 2.29 4.59 4.15 ...
##  $ Urban_population                       : num  4.71e+08 2.71e+08 1.16e+08 1.08e+08 4.21e+07 ...
df <- data

At this point, we have completed the initial data cleaning, removed missing values and outliers, and obtained a cleaned dataset ready for machine learning.

2.3 Single variable models

Binary Classification of the Target Variable

Based on the data characteristics, we have chosen the mean value as the classification threshold.

mean_unemployment_rate <- mean(data$Unemployment.rate)
data$Unemployment <- ifelse(data$Unemployment.rate > mean_unemployment_rate, "1", "-1")
data <- data %>% select(-Unemployment.rate)
print(table(data$Unemployment))
## 
## -1  1 
## 28 20
summary(data)
##    Country            Population        Gross.tertiary.education.enrollment....
##  Length:48          Min.   :2.025e+05   Min.   : 7.60                          
##  Class :character   1st Qu.:1.583e+07   1st Qu.:35.80                          
##  Mode  :character   Median :4.185e+07   Median :57.45                          
##                     Mean   :1.200e+08   Mean   :55.17                          
##                     3rd Qu.:9.744e+07   3rd Qu.:72.85                          
##                     Max.   :1.398e+09   Max.   :94.30                          
##  Urban_population    Unemployment      
##  Min.   :    35588   Length:48         
##  1st Qu.:  9651217   Class :character  
##  Median : 30732090   Mode  :character  
##  Mean   : 69730921                     
##  3rd Qu.: 57178091                     
##  Max.   :842933962

Splitting the Dataset

Since the dataset is relatively small, we choose to split the data into a training set and a test set, while also setting aside a calibration set for better model tuning.

d <- data %>%
  select(-Country)
set.seed(2333)
d$rg <- runif(dim(d)[1])
dTrainAll <- subset(d, rg<=0.7)
dTest <- subset(d, rg>0.7)
vars <- setdiff(colnames(dTrainAll), c('rg', 'Unemployment'))
# split dTrainAll into a training set and a validation (or calibration) set
useForCal <- rbinom(n=dim(dTrainAll)[1], size=1, prob=0.1)>0
dCal <- subset(dTrainAll, useForCal)
dTrain <- subset(dTrainAll, !useForCal)

Print the three subsets and check if each subset contains values for the target variable as ‘1’ and ‘-1’:

table(dTrain[, 'Unemployment'])
## 
## -1  1 
## 15 12
table(dCal[, 'Unemployment'])
## 
## -1  1 
##  3  2
table(dTest[, 'Unemployment'])
## 
## -1  1 
## 10  6

Build univariate models for numeric variables.

As mentioned in the lecture, we construct the mkPredC function and mkPredN function for numeric variables.

outcome <- 'Unemployment'
pos <- '1'

mkPredC <- function(outCol, varCol, appCol) {
pPos <- sum(outCol == pos) / length(outCol)
naTab <- table(as.factor(outCol[is.na(varCol)]))
pPosWna <- (naTab/sum(naTab))[pos]
vTab <- table(as.factor(outCol), varCol)
#print(vTab)
pPosWv <- (vTab[pos, ] + 1.0e-3*pPos) / (colSums(vTab) + 1.0e-3)
pred <- pPosWv[appCol]
pred[is.na(appCol)] <- pPosWna
pred[is.na(pred)] <- pPos
pred
}

mkPredN <- function(outCol, varCol, appCol) {
# compute the cuts
cuts <- unique(
quantile(varCol, probs=seq(0, 1, 0.1), na.rm=T))
# discretize the numerical columns
varC <- cut(varCol,cuts)
appC <- cut(appCol,cuts)
mkPredC(outCol,varC,appC)
}

calcAUC <- function(predcol,outcol) {
perf <- performance(prediction(predcol,outcol == pos),'auc')
as.numeric(perf@y.values)
}

Process all variables.

for(v in vars) {
pi <- paste('pred', v, sep='')
dTrain[,pi] <- mkPredN(dTrain[,outcome], dTrain[,v], dTrain[,v])
dCal[,pi] <- mkPredN(dTrain[,outcome], dTrain[,v], dCal[,v])
dTest[,pi] <- mkPredN(dTrain[,outcome], dTrain[,v], dTest[,v])
aucTrain <- calcAUC(dTrain[,pi], dTrain[,outcome])
if(aucTrain >= 0.55) {
aucCal <- calcAUC(dCal[,pi], dCal[,outcome])
print(sprintf(
"%s: trainAUC: %4.3f; calibrationAUC: %4.3f",
pi, aucTrain, aucCal))
}
}
## [1] "predPopulation: trainAUC: 0.872; calibrationAUC: 0.667"
## [1] "predGross.tertiary.education.enrollment....: trainAUC: 0.864; calibrationAUC: 1.000"
## [1] "predUrban_population: trainAUC: 0.828; calibrationAUC: 0.500"

Univariate model evaluation.

Further check the AUC values using 100-fold cross-validation. Since the dataset is small, there is a risk that random splits may result in one group becoming a single classification problem, so we include a loop condition to ensure that each group is always binary.

for (var in vars) {
aucs <- rep(0,100)
for (rep in 1:length(aucs)) {
useForCalRep <- rbinom(n=nrow(dTrainAll), size=1, prob=0.1) > 0
# Check if the training subset contains only one class
while (length(unique(dTrainAll[useForCalRep, outcome])) < 2) {
  useForCalRep <- rbinom(n = nrow(dTrainAll), size = 1, prob = 0.1) > 0
}
predRep <- mkPredN(dTrainAll[!useForCalRep, outcome],
dTrainAll[!useForCalRep, var],
dTrainAll[useForCalRep, var])
aucs[rep] <- calcAUC(predRep, dTrainAll[useForCalRep,outcome])
}
print(sprintf("%s: mean: %4.3f; sd: %4.3f", var, mean(aucs), sd(aucs)))
}
## [1] "Population: mean: 0.615; sd: 0.333"
## [1] "Gross.tertiary.education.enrollment....: mean: 0.683; sd: 0.322"
## [1] "Urban_population: mean: 0.391; sd: 0.294"

Based on the results, the “Population” variable has an average AUC value of 0.615 with a standard deviation of 0.333. This means that models built using the “Population” variable have average performance and may exhibit significant fluctuations in performance. However, the average AUC value of this variable is the closest to the original calibration set results.

The “Gross.tertiary.education.enrollment….” variable has an average AUC value of 0.683 with a standard deviation of 0.322. This indicates that models constructed using the “Gross.tertiary.education.enrollment….” variable have relatively high average performance, but there is still considerable variability in performance across different cross-validations. In the previous analysis, we observed that this variable performed better on the calibration set than on the training set, but in this analysis, we see that the average AUC value is still lower than that of the training set. The seemingly perfect results are due to the high variability of random effects. Nevertheless, the original AUC value is already close to the maximum, indicating that the model’s performance is quite commendable.

“Urban_population” has an average AUC value of 0.391 with a standard deviation of 0.294. Models built using this variable exhibit lower average performance, and performance fluctuates relatively more.

Overall, the “Gross.tertiary.education.enrollment….” variable appears to perform the best among these univariate models, with the highest average AUC value. The “Urban_population” variable has relatively lower performance.

Double density plot:

fig1 <- ggplot(dCal) + geom_density(aes(x=predPopulation, color=as.factor(Unemployment)))
fig2 <- ggplot(dCal) + geom_density(aes(x=predGross.tertiary.education.enrollment...., color=as.factor(Unemployment)))
fig3 <- ggplot(dCal) + geom_density(aes(x=predUrban_population, color=as.factor(Unemployment)))
library(gridExtra)

heights <- c(3, 3, 3)  
widths <- c(3) 
grid.arrange(fig1, fig2, fig3, ncol = 1, heights = heights, widths = widths)

The analysis shows that for all three variables, the heights of the negative curves are significantly higher than the positive curves in most regions. This may suggest that the model is more accurate in predicting the negative category within these value ranges. Additionally, we can observe that when the predicted values of the variables are relatively high, the accuracy of predicting the positive variable tends to be better. This trend is present for all three variables. The “Population” variable exhibits more overlap between the positive and negative curves, indicating that it might be challenging to distinguish between these two categories within these value ranges. Overall, the double density plots show that the performance of all three variables is quite similar.

Finally, let’s take a look at the ROC curves for the test set:

plot_roc <- function(predcol, outcol, colour_id=2, overlaid=F) {
ROCit_obj <- rocit(score=predcol, class=outcol==pos)
par(new=overlaid)
plot(ROCit_obj, col = c(colour_id, 1),
legend = FALSE, YIndex = FALSE, values = FALSE)
}

calcAUC(dTrain[,"predPopulation"], dTrain[,outcome]);calcAUC(dCal[,"predPopulation"], dCal[,outcome]);calcAUC(dTest[,"predPopulation"], dTest[,outcome]); 
## [1] 0.8722222
## [1] 0.6666667
## [1] 0.4166667
calcAUC(dTrain[,"predGross.tertiary.education.enrollment...."], dTrain[,outcome]);calcAUC(dCal[,"predGross.tertiary.education.enrollment...."], dCal[,outcome]);calcAUC(dTest[,"predGross.tertiary.education.enrollment...."], dTest[,outcome]); 
## [1] 0.8638889
## [1] 1
## [1] 0.4166667
calcAUC(dTrain[,"predUrban_population"], dTrain[,outcome]);calcAUC(dCal[,"predUrban_population"], dCal[,outcome]);calcAUC(dTest[,"predUrban_population"], dTest[,outcome]); 
## [1] 0.8277778
## [1] 0.5
## [1] 0.625
plot_roc(dTest$predPopulation, dTest[,outcome]) #red
plot_roc(dTest$predGross.tertiary.education.enrollment...., dTest[,outcome], colour_id=3, overlaid=T) #green
plot_roc(dTest$predUrban_population, dTest[,outcome], colour_id=4, overlaid=T) #blue

The ROC curve for the “Urban_population” variable clearly shows the best performance, with the largest area under the curve (AUC) in the test set, while the other two variables have similar AUC values.

Therefore, in the overall evaluation, “Urban_population” performs best on the test set, but it exhibits poorer performance in cross-validation. On the other hand, “Gross.tertiary.education.enrollment….” performs best in cross-validation, and its double density plots and ROC curves are also decent. Thus, we can conclude that “Gross.tertiary.education.enrollment….” is the best single-variable model.

2.4 Null Model Analysis

###Construction and Evaluation of Null Models

Npos <- sum(dTrain[,outcome] == pos)
pred.Null <- Npos / nrow(dTrain)
cat("Proportion of outcome == 1 in dTrain:", pred.Null)
## Proportion of outcome == 1 in dTrain: 0.4444444

Because pred.Null is 0.4444444, we set the threshold to 0.5. Therefore, the model will predict all as ‘negative.’ Evaluate the validation set and calculate the relevant evaluation metrics.

TP <- 0; TN <- sum(dCal[,outcome] == -1); 
FP <- 0; FN <- sum(dCal[,outcome] == 1); 
cat("nrow(dCal):", nrow(dCal), "TP:", TP, "TN:", TN, "FP:", FP, "FN:", FN)
## nrow(dCal): 5 TP: 0 TN: 3 FP: 0 FN: 2
(accuracy <- (TP + TN) / nrow(dCal))
## [1] 0.6
(precision <- TP/(TP + FP))
## [1] NaN
(recall <- TP/(TP + FN))
## [1] 0
pred.Null <- rep(pred.Null, nrow(dCal))
(AUC <- calcAUC(pred.Null, dCal[,outcome]))
## [1] 0.5

After calculating, the AUC value of the null model on the validation set is 0.5. In contrast, the AUC values of our univariate model on the validation set are all greater than or equal to 0.5. This demonstrates that the performance of the univariate model is superior to the null model. Subsequently, we can use these two models to evaluate other complex models.

2.5 Multivariate Model-Feature Selection

We use log-likelihood to further perform feature selection.

logLikelihood <- function(ytrue, ypred, epsilon=1e-6) {
sum(ifelse(ytrue==pos, log(ypred+epsilon), log(1-ypred-epsilon)), na.rm=T)
}

Following the calculation method of log-likelihood, we first perform the calculation for the null model.

logNull <- logLikelihood(
dCal[,outcome], sum(dCal[,outcome]==pos)/nrow(dCal)
)
cat(logNull)
## -3.365058

Next, we iterate through all numerical variables and select variables based on the extent to which they reduce bias relative to zero.

selVars <- c()
for (v in vars) {
pi <- paste('pred', v, sep='')
devDrop <- 2*(logLikelihood(dCal[,outcome], dCal[,pi]) - logNull)
print(sprintf("%s, deviance reduction: %g", pi, devDrop))
selVars <- c(selVars, pi)
}
## [1] "predPopulation, deviance reduction: 0.1391"
## [1] "predGross.tertiary.education.enrollment...., deviance reduction: 3.48528"
## [1] "predUrban_population, deviance reduction: -16.6714"

Based on this result, we can see that the “predGross.tertiary.education.enrollment….” variable has the highest deviance reduction value, indicating that adding this numerical variable will have a more significant positive impact on improving the model’s fit and reducing bias.

On the other hand, the “predUrban_population” variable has a negative deviance reduction value, suggesting that including this numerical variable would worsen the model’s fit, increasing bias. This variable does not provide strong predictive information for the target variable.

Therefore, we are considering using “predPopulation” and “predGross.tertiary.education.enrollment….” as feature variables for multivariate analysis and experimenting with the inclusion of the “predUrban_population” variable in different models to observe its impact on the models.

2.6 Multivariate Model - Decision tree classifier

Create a multivariate model and examine the statistical analysis results.We will start by building a model using the best-performing “predPopulation”和”predGross.tertiary.education.enrollment….” variables.

First, we use the two high-performing variables to build a decision tree model called “t2model” and output the AUC value.

(fV <- paste(outcome,'> 0 ~ ',
paste(c("predPopulation", "predGross.tertiary.education.enrollment...."), collapse=' + '),
sep=''))
## [1] "Unemployment> 0 ~ predPopulation + predGross.tertiary.education.enrollment...."
t2model <- rpart(fV, data=dTrain)
print(calcAUC(predict(t2model, newdata=dTrain), dTrain[,outcome]))
## [1] 0.7833333
print(calcAUC(predict(t2model, newdata=dCal), dCal[,outcome]))
## [1] 1
print(calcAUC(predict(t2model, newdata=dTest), dTest[,outcome]))
## [1] 0.4666667

We noticed that the performance of this model is not better than the univariate model. However, achieving a perfect score of 1 on the validation set is a positive result that we are happy to see.

Next, we will calculate performance metrics such as accuracy, precision, recall, and F1 score to evaluate the model.

To begin, let’s create a function to calculate these results for repeated use:

performanceMeasures <- function(ytrue, ypred, model.name = "model", threshold=0.5) {
# compute the normalised deviance
dev.norm <- -2 * logLikelihood(ytrue, ypred)/length(ypred)
# compute the confusion matrix
cmat <- table(actual = ytrue, predicted = ypred >= threshold)
accuracy <- sum(diag(cmat)) / sum(cmat)
precision <- cmat[2, 2] / sum(cmat[, 2])
recall <- cmat[2, 2] / sum(cmat[2, ])
f1 <- 2 * precision * recall / (precision + recall)
data.frame(model = model.name, precision = precision,
recall = recall, f1 = f1, dev.norm = dev.norm)
}

For more visually appealing output, we can define a function to set up Pander formatting:

panderOpt <- function(){
library(pander)
# setting up Pander Options
panderOptions("plain.ascii", TRUE)
panderOptions("keep.trailing.zeros", TRUE)
panderOptions("table.style", "simple")
}

Let’s create a function for tabular output:

t_pretty_perf_table <- function(model, xtrain, ytrain, xcal, ycal,
xtest, ytest, threshold=0.5) {
# Option setting for Pander
panderOpt()
perf_justify <- "lrrrr"
# call the predict() function to do the predictions
pred_train <- predict(model, newdata=xtrain)
pred_cal <- predict(model, newdata=xcal)
pred_test <- predict(model, newdata=xtest)
# comparing performance on training, calibration,test
trainperf_df <- performanceMeasures(
ytrain, pred_train, model.name="training", threshold=threshold)
calperf_df <- performanceMeasures(
ycal, pred_cal, model.name="calibration", threshold=threshold)
testperf_df <- performanceMeasures(
ytest, pred_test, model.name="test", threshold=threshold)
# combine the three performance data frames using rbind()
perftable <- rbind(trainperf_df, calperf_df, testperf_df)
pandoc.table(perftable, justify = perf_justify)
}

Call the t_pretty_perf_table() function to output the table:

t_pretty_perf_table(t2model, dTrain[c("predPopulation", "predGross.tertiary.education.enrollment....")], dTrain[,outcome]==pos, dCal[c("predPopulation", "predGross.tertiary.education.enrollment....")], dCal[,outcome]==pos, dTest[c("predPopulation", "predGross.tertiary.education.enrollment....")], dTest[,outcome]==pos)
## 
## 
## model           precision   recall       f1   dev.norm
## ------------- ----------- -------- -------- ----------
## training           0.7143   0.8333   0.7692      1.460
## calibration        1.0000   1.0000   1.0000      1.203
## test               0.3333   0.3333   0.3333      1.148

Add the variable “predUrban_population”

We will now include the “predUrban_population” variable to build a decision tree model. Let’s see how adding “predUrban_population” affects the model’s performance:

(fV <- paste(outcome,'> 0 ~ ',
paste(c("predPopulation", "predGross.tertiary.education.enrollment....", "predUrban_population"), collapse=' + '),
sep=''))
## [1] "Unemployment> 0 ~ predPopulation + predGross.tertiary.education.enrollment.... + predUrban_population"
t3model <- rpart(fV, data=dTrain)
print(calcAUC(predict(t3model, newdata=dTrain), dTrain[,outcome]))
## [1] 0.7833333
print(calcAUC(predict(t3model, newdata=dCal), dCal[,outcome]))
## [1] 1
print(calcAUC(predict(t3model, newdata=dTest), dTest[,outcome]))
## [1] 0.4666667

Calculate the relevant evaluation metrics:

t_pretty_perf_table(t3model, dTrain[c("predPopulation", "predGross.tertiary.education.enrollment....", "predUrban_population")], dTrain[,outcome]==pos, dCal[c("predPopulation", "predGross.tertiary.education.enrollment....", "predUrban_population")], dCal[,outcome]==pos, dTest[c("predPopulation", "predGross.tertiary.education.enrollment....", "predUrban_population")], dTest[,outcome]==pos)
## 
## 
## model           precision   recall       f1   dev.norm
## ------------- ----------- -------- -------- ----------
## training           0.7143   0.8333   0.7692      1.460
## calibration        1.0000   1.0000   1.0000      1.203
## test               0.3333   0.3333   0.3333      1.148

We can observe that whether it’s the AUC value or precision/recall/F1 score/normalized bias, adding the “predUrban_population” variable does not have a significant impact. It doesn’t decrease the model’s performance as expected, but it also doesn’t further improve it.

From the results, we can see that the decision tree model performs relatively well on the training and validation sets but slightly worse on the test set, indicating some overfitting.

Let’s also take a look at the ROC curve for further analysis:

plot_roc <- function(predcol1, outcol1, predcol2, outcol2){
roc_1 <- rocit(score=predcol1, class=outcol1==pos)
roc_2 <- rocit(score=predcol2, class=outcol2==pos)
plot(roc_1, col = c("blue","green"), lwd = 3,
legend = FALSE,YIndex = FALSE, values = TRUE, asp=1)
lines(roc_2$TPR ~ roc_2$FPR, lwd = 3,
col = c("red","green"), asp=1)
legend("bottomright", col = c("blue","red", "green"),
c("Test Data", "Training Data", "Null Model"), lwd = 2)
}
pred_test_roc <- predict(t3model, newdata=dTest)
pred_train_roc <- predict(t3model, newdata=dTrain)
plot_roc(pred_test_roc, dTest[[outcome]],
pred_train_roc, dTrain[[outcome]])

We can see that the model not only suffers from overfitting, but the AUC on the test set is even lower than the null model, indicating that the model’s performance is not very optimistic.

To improve the model, let’s make adjustments to some of the model’s hyperparameters. We will build a new model called “t3model2” based on the three-parameter model.

t3model2 <- rpart(fV, data=dTrain,
control=rpart.control(cp=0.001, minsplit=2,
minbucket=2, maxdepth=5))
print(calcAUC(predict(t3model2, newdata=dTrain[c("predPopulation", "predGross.tertiary.education.enrollment....", "predUrban_population")]), dTrain[,outcome]))
## [1] 0.9666667
print(calcAUC(predict(t3model2, newdata=dCal[c("predPopulation", "predGross.tertiary.education.enrollment....", "predUrban_population")]), dCal[,outcome]))
## [1] 0.6666667
print(calcAUC(predict(t3model2, dTest[c("predPopulation", "predGross.tertiary.education.enrollment....", "predUrban_population")]), dTest[,outcome]))
## [1] 0.4666667
t_pretty_perf_table(t3model2, dTrain[c("predPopulation", "predGross.tertiary.education.enrollment....", "predUrban_population")], dTrain[,outcome]==pos, dCal[c("predPopulation", "predGross.tertiary.education.enrollment....", "predUrban_population")], dCal[,outcome]==pos, dTest[c("predPopulation", "predGross.tertiary.education.enrollment....", "predUrban_population")], dTest[,outcome]==pos)
## 
## 
## model           precision   recall       f1   dev.norm
## ------------- ----------- -------- -------- ----------
## training           0.8462   0.9167   0.8800     0.4321
## calibration        0.5000   1.0000   0.6667     1.1562
## test               0.4000   0.3333   0.3636     0.2092
pred_test_roc <- predict(t3model2, newdata=dTest)
pred_train_roc <- predict(t3model2, newdata=dTrain)
plot_roc(pred_test_roc, dTest[[outcome]],
pred_train_roc, dTrain[[outcome]])

Since your dataset is relatively small, you’ve noticed an improvement in model performance by adjusting the minsplit and minbucket parameters to 2. Precision and F1 on the test set have improved, and dev.norm has decreased. However, the model still suffers from overfitting, which might be due to the limited sample size.

Let’s take a look at the ROC curve for the model before and after the modifications:

pred_test_old <- predict(t3model, newdata=dTest)
pred_test_new <- predict(t3model2, newdata=dTest)
plot_roc(pred_test_old, dTest[[outcome]],
pred_test_new, dTest[[outcome]])

The areas under the curves are the same.

Finally, let’s print the decision tree and create a visualization:

print(t3model2)
## n= 27 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 27 6.6666670 0.4444444  
##    2) predGross.tertiary.education.enrollment....< 0.4166713 13 1.6923080 0.1538462  
##      4) predGross.tertiary.education.enrollment....< 0.1667962 7 0.0000000 0.0000000 *
##      5) predGross.tertiary.education.enrollment....>=0.1667962 6 1.3333330 0.3333333  
##       10) predPopulation< 0.1667592 3 0.0000000 0.0000000 *
##       11) predPopulation>=0.1667592 3 0.6666667 0.6666667 *
##    3) predGross.tertiary.education.enrollment....>=0.4166713 14 2.8571430 0.7142857  
##      6) predUrban_population< 0.4166713 6 1.3333330 0.3333333  
##       12) predPopulation< 0.4166713 4 0.7500000 0.2500000 *
##       13) predPopulation>=0.4166713 2 0.5000000 0.5000000 *
##      7) predUrban_population>=0.4166713 8 0.0000000 1.0000000 *
par(cex=1.5)
rpart.plot(t3model2)

2.7 Multivariate Model - Naive Bayes model

To establish a Naive Bayes model using the “predPopulation” and “predGross.tertiary.education.enrollment….” variables.

Following the lab content, use the ‘naiveBayes’ function from the ‘e1071’ package to construct a model. Start with using the two variables with better performance:

# construct the formula f as a character string
(f <- paste(outcome,'> 0 ~ ',
paste(c("predPopulation", "predGross.tertiary.education.enrollment...."), collapse=' + '),
sep=''))
## [1] "Unemployment> 0 ~ predPopulation + predGross.tertiary.education.enrollment...."
nb2model <- naiveBayes(as.formula(f), data=dTrain)

dTrain$nbpred <- predict(nb2model, newdata=dTrain, type='raw')[,'TRUE']
dCal$nbpred <- predict(nb2model, newdata=dCal, type='raw')[,'TRUE']
dTest$nbpred <- predict(nb2model, newdata=dTest, type='raw')[,'TRUE']


cat('The AUC value for the training set is:', calcAUC(dTrain$nbpred, dTrain[,outcome]), "\n")
## The AUC value for the training set is: 0.9527778
cat('The AUC value for the calibration set is:', calcAUC(dCal$nbpred, dCal[,outcome]), "\n")
## The AUC value for the calibration set is: 1
cat('The AUC value for the test set is:', calcAUC(dTest$nbpred, dTest[,outcome]), "\n")
## The AUC value for the test set is: 0.4

To output an evaluation metrics table, we are building a new output table function:

nb_pretty_perf_table <- function(model, xtrain, ytrain, xcal, ycal,
xtest, ytest, threshold=0.5) {
# Option setting for Pander
panderOpt()
perf_justify <- "lrrrr"
# call the predict() function to do the predictions
pred_train <- predict(model, newdata=xtrain, type='raw')[,'TRUE']
pred_cal <- predict(model, newdata=xcal, type='raw')[,'TRUE']
pred_test <- predict(model, newdata=xtest, type='raw')[,'TRUE']
# comparing performance on training, calibration,test
trainperf_df <- performanceMeasures(
ytrain, pred_train, model.name="training", threshold=threshold)
calperf_df <- performanceMeasures(
ycal, pred_cal, model.name="calibration", threshold=threshold)
testperf_df <- performanceMeasures(
ytest, pred_test, model.name="test", threshold=threshold)
# combine the three performance data frames using rbind()
perftable <- rbind(trainperf_df, calperf_df, testperf_df)
pandoc.table(perftable, justify = perf_justify)
}

Build the table:

nb_pretty_perf_table(nb2model, dTrain[c("predPopulation", "predGross.tertiary.education.enrollment....")], dTrain[,outcome]==pos, dCal[c("predPopulation", "predGross.tertiary.education.enrollment....")], dCal[,outcome]==pos, dTest[c("predPopulation", "predGross.tertiary.education.enrollment....")], dTest[,outcome]==pos)
## 
## 
## model           precision   recall       f1   dev.norm
## ------------- ----------- -------- -------- ----------
## training           0.7857   0.9167   0.8462      3.240
## calibration        0.6667   1.0000   0.8000      3.374
## test               0.2857   0.3333   0.3077      2.657

Plot the ROC curve:

pred_test_roc <- predict(nb2model, newdata=dTest, type='raw')[,'TRUE']
pred_train_roc <- predict(nb2model, newdata=dTrain, type= 'raw')[,'TRUE']
plot_roc(pred_test_roc, dTest[[outcome]],
pred_train_roc, dTrain[[outcome]])

Based on the data results and the ROC curve above, we can observe that even when using the Naive Bayes model, the performance on the test set is still not as good as the null model.

Add the variable “predUrban_population”

Let’s try adding the variable “predUrban_population” to the model and see if it improves the performance:

(f <- paste(outcome,'> 0 ~ ',
paste(c("predPopulation", "predGross.tertiary.education.enrollment....", "predUrban_population"), collapse=' + '),
sep=''))
## [1] "Unemployment> 0 ~ predPopulation + predGross.tertiary.education.enrollment.... + predUrban_population"
nb3model <- naiveBayes(as.formula(f), data=dTrain)

dTrain$nbpred <- predict(nb3model, newdata=dTrain, type='raw')[,'TRUE']
dCal$nbpred <- predict(nb3model, newdata=dCal, type='raw')[,'TRUE']
dTest$nbpred <- predict(nb3model, newdata=dTest, type='raw')[,'TRUE']


cat('The AUC value for the training set is:', calcAUC(dTrain$nbpred, dTrain[,outcome]), "\n")
## The AUC value for the training set is: 0.95
cat('The AUC value for the calibration set is:', calcAUC(dCal$nbpred, dCal[,outcome]), "\n")
## The AUC value for the calibration set is: 0.8333333
cat('The AUC value for the test set is:', calcAUC(dTest$nbpred, dTest[,outcome]), "\n")
## The AUC value for the test set is: 0.5416667

After adding the new variable, there is a slight improvement in the AUC value on the test set, even though this variable was initially considered to have a negative impact on the model. Let’s continue to observe the evaluation metrics and the ROC curve.

nb_pretty_perf_table(nb3model, dTrain[c("predPopulation", "predGross.tertiary.education.enrollment....", "predUrban_population")], dTrain[,outcome]==pos, dCal[c("predPopulation", "predGross.tertiary.education.enrollment....", "predUrban_population")], dCal[,outcome]==pos, dTest[c("predPopulation", "predGross.tertiary.education.enrollment....", "predUrban_population")], dTest[,outcome]==pos)
## 
## 
## model           precision   recall       f1   dev.norm
## ------------- ----------- -------- -------- ----------
## training              1.0   0.8333   0.9091      4.154
## calibration           0.5   0.5000   0.5000      4.621
## test                  0.4   0.3333   0.3636      3.392
pred_test_roc <- predict(nb3model, newdata=dTest, type='raw')[,'TRUE']
pred_train_roc <- predict(nb3model, newdata=dTrain, type= 'raw')[,'TRUE']
plot_roc(pred_test_roc, dTest[[outcome]],
pred_train_roc, dTrain[[outcome]])

The test set precision and F1 score have both improved, and the training set precision even reached a perfect score. However, dev.norm has also shown some improvement. Observing the ROC curve, we can see that the performance has indeed improved with the addition of a variable. However, severe overfitting still exists.

Let’s examine the dual density plot:

ggplot(dCal) + geom_density(aes(x=nbpred, color=as.factor(Unemployment)))

In comparison to the previous plot, it’s evident that the dual density plot of this model has a larger area under the positive curve when using the negative curve as a reference. This indicates that the model’s predictions for the negative class have improved. Similar to before, the area under the positive curve in the high-frequency region is larger, indicating better performance in predicting the positive class.

However, the model still exhibits overfitting issues, which are likely related to the small dataset size and data quality concerns.

Furthermore, we do not consider applying LIME explanations to the above two models. This is because decision trees inherently possess high interpretability, with each node corresponding to a criterion based on a specific feature. Thus, the model’s prediction process can be intuitively understood. Naive Bayes utilizes conditional probability relationships between features for classification, and these conditional probabilities can typically be directly estimated from the data. Consequently, the model’s operational principles are transparent, eliminating the need for additional explanations. These two models do not fall into the category of “black-box” models, as their decision processes are highly transparent, rendering LIME explanations unnecessary.

2.8 Model Comparison and Evaluation.

Let’s begin by comparing the performance of the decision tree model and the Naive Bayes model for the same problem, starting with a comparison of the AUC values:

cat('The AUC value for the test set is in tree:', calcAUC(predict(t3model2, dTest[c("predPopulation", "predGross.tertiary.education.enrollment....", "predUrban_population")]), dTest[,outcome]), "\n")
## The AUC value for the test set is in tree: 0.4666667
cat('The AUC value for the test set is in nb:', calcAUC(dTest$nbpred, dTest[,outcome]), "\n")
## The AUC value for the test set is in nb: 0.5416667

It’s clear that Naive Bayes significantly outperforms the decision tree, while the decision tree even has an AUC value below 0.5, performing worse than the null model.

Let’s also consider the AUC values of the three univariate models for further observation:

for (var in vars) {
aucs <- rep(0,100)
for (rep in 1:length(aucs)) {
useForCalRep <- rbinom(n=nrow(dTrainAll), size=1, prob=0.1) > 0
# Check if the training subset contains only one class
while (length(unique(dTrainAll[useForCalRep, outcome])) < 2) {
  useForCalRep <- rbinom(n = nrow(dTrainAll), size = 1, prob = 0.1) > 0
}
predRep <- mkPredN(dTrainAll[!useForCalRep, outcome],
dTrainAll[!useForCalRep, var],
dTrainAll[useForCalRep, var])
aucs[rep] <- calcAUC(predRep, dTrainAll[useForCalRep,outcome])
}
print(sprintf("The AUC value for %s is: %4.3f", var, mean(aucs)))
}
## [1] "The AUC value for Population is: 0.616"
## [1] "The AUC value for Gross.tertiary.education.enrollment.... is: 0.737"
## [1] "The AUC value for Urban_population is: 0.414"

From these results, it’s evident that the performance of the two multivariate models is not better than that of the two best-performing univariate models.Next, let’s compare the evaluation metrics for different models:

Naive Bayes:

nb_pretty_perf_table(nb3model, dTrain[c("predPopulation", "predGross.tertiary.education.enrollment....", "predUrban_population")], dTrain[,outcome]==pos, dCal[c("predPopulation", "predGross.tertiary.education.enrollment....", "predUrban_population")], dCal[,outcome]==pos, dTest[c("predPopulation", "predGross.tertiary.education.enrollment....", "predUrban_population")], dTest[,outcome]==pos)
## 
## 
## model           precision   recall       f1   dev.norm
## ------------- ----------- -------- -------- ----------
## training              1.0   0.8333   0.9091      4.154
## calibration           0.5   0.5000   0.5000      4.621
## test                  0.4   0.3333   0.3636      3.392

Decision tree:

t_pretty_perf_table(t3model2, dTrain[c("predPopulation", "predGross.tertiary.education.enrollment....", "predUrban_population")], dTrain[,outcome]==pos, dCal[c("predPopulation", "predGross.tertiary.education.enrollment....", "predUrban_population")], dCal[,outcome]==pos, dTest[c("predPopulation", "predGross.tertiary.education.enrollment....", "predUrban_population")], dTest[,outcome]==pos)
## 
## 
## model           precision   recall       f1   dev.norm
## ------------- ----------- -------- -------- ----------
## training           0.8462   0.9167   0.8800     0.4321
## calibration        0.5000   1.0000   0.6667     1.1562
## test               0.4000   0.3333   0.3636     0.2092

From the parameters, it appears that the two models have nearly identical performance, especially on the test set where the first three metrics are the same. However, the decision tree has slightly lower bias, indicating relatively better model performance.

Finally, let’s draw the ROC curves for comparison. We’ll choose the two best-performing univariate models and the two multivariate models to plot on the test set:

plot_roc <- function(predcol1, outcol1, predcol2, outcol2, predcol3, outcol3, predcol4, outcol4){
  roc_1 <- rocit(score=predcol1, class=outcol1==pos)
  roc_2 <- rocit(score=predcol2, class=outcol2==pos)
  roc_3 <- rocit(score=predcol3, class=outcol3==pos)
  roc_4 <- rocit(score=predcol4, class=outcol4==pos)
  
  plot(roc_1, col = "blue", lwd = 2, legend = FALSE, YIndex = FALSE, values = TRUE, cex = 3)
  lines(roc_2$TPR ~ roc_2$FPR, col = "red", lwd = 2, cex = 3)
  lines(roc_3$TPR ~ roc_3$FPR, col = "green", lwd = 2, cex = 3)
  lines(roc_4$TPR ~ roc_4$FPR, col = "orange", lwd = 2, cex = 3)
  
  legend("bottomright", col = c("blue", "red", "green", "orange"),
         c("PopSinModel", "GroSinModel", "DtreeModel", "NBModel"), lwd = 2, cex = 1)
}

pred_Pop_roc <- dTest$predPopulation
pred_Gro_roc <- dTest$predGross.tertiary.education.enrollment....
pred_tree_roc <- pred_test_new
pred_nb_roc <- pred_test_roc
plot_roc(pred_Pop_roc, dTest[,outcome], pred_Gro_roc, dTest[,outcome], pred_tree_roc, dTest[,outcome], pred_nb_roc, dTest[,outcome])

From this graph, we can analyze that there are a total of five models (including the null model), and they show relatively similar performance based on the AUC. Interestingly, the univariate model of the education rate and the decision tree model exhibit similar trends when the false positive rate (FPR) is greater than 0.4. There might be some underlying correlation here, which could be a topic for further investigation in this project.

In summary, overall, the two multivariate models didn’t outperform the top-performing univariate models. This may be due to the limited dataset size and a small number of variables, making univariate models more suitable in this scenario. The two multivariate models show similar performance and have their respective strengths, but they both exhibit overfitting issues. In terms of variables, the total population and education rate of a country have a significant impact on the unemployment rate prediction, while the urban population rate has less influence.

Part 2 - Clustering

Problem Analysis and Data Processing

In the problem analysis and data processing phase, we have a cleaned data frame, ‘df,’ which contains 48 countries and 4 observational metrics, including Population, Gross tertiary education enrollment, Unemployment rate, and Urban population. For the clustering problem, we also want to include longitude and latitude as reference metrics. In the previous analysis, it was observed that there is an equal number of missing values for longitude, latitude, and unemployment rate in the ‘youtubedata’ data frame, leading to the suspicion that the missing rows are the same. Let’s conduct a check to verify this:

count_suspected_na_rows <- sum(rowSums(is.na(youtubedataT2[, c("Population", "Unemployment.rate", "Urban_population", "Latitude",   "Longitude")])) == 5)
print(count_suspected_na_rows)
## [1] 123

Indeed, it appears that the missing rows in the ‘youtubedata’ are the same as those in the ‘df’ data frame. So, let’s start by extracting the columns from the ‘youtubedata’ that contain country names and longitude/latitude. We will then join these columns with the ‘df’ data frame based on the country names.

country_lon_lat <- youtubedataT2 %>% 
  select(Country, Latitude, Longitude) %>%
  filter(!is.na(Latitude))
country_lon_lat <- unique(country_lon_lat)

# Using left_join function
df <- left_join(df, country_lon_lat, by = "Country")

Let’s take a look at the new data frame.

str(df)
## 'data.frame':    48 obs. of  7 variables:
##  $ Country                                : chr  "India" "United States" "Japan" "Russia" ...
##  $ Population                             : num  1.37e+09 3.28e+08 1.26e+08 1.44e+08 5.17e+07 ...
##  $ Gross.tertiary.education.enrollment....: num  28.1 88.2 63.2 81.9 94.3 60 68.9 51.3 90 88.5 ...
##  $ Unemployment.rate                      : num  5.36 14.7 2.29 4.59 4.15 ...
##  $ Urban_population                       : num  4.71e+08 2.71e+08 1.16e+08 1.08e+08 4.21e+07 ...
##  $ Latitude                               : num  20.6 37.1 36.2 61.5 35.9 ...
##  $ Longitude                              : num  79 -95.7 138.3 105.3 127.8 ...
summary(df)
##    Country            Population        Gross.tertiary.education.enrollment....
##  Length:48          Min.   :2.025e+05   Min.   : 7.60                          
##  Class :character   1st Qu.:1.583e+07   1st Qu.:35.80                          
##  Mode  :character   Median :4.185e+07   Median :57.45                          
##                     Mean   :1.200e+08   Mean   :55.17                          
##                     3rd Qu.:9.744e+07   3rd Qu.:72.85                          
##                     Max.   :1.398e+09   Max.   :94.30                          
##  Unemployment.rate Urban_population       Latitude        Longitude      
##  Min.   : 0.750    Min.   :    35588   Min.   :-38.42   Min.   :-172.10  
##  1st Qu.: 3.743    1st Qu.:  9651217   1st Qu.: 11.27   1st Qu.: -60.56  
##  Median : 5.315    Median : 30732090   Median : 28.07   Median :  25.18  
##  Mean   : 6.507    Mean   : 69730921   Mean   : 24.23   Mean   :  16.65  
##  3rd Qu.: 9.193    3rd Qu.: 57178091   3rd Qu.: 40.82   3rd Qu.:  81.81  
##  Max.   :14.720    Max.   :842933962   Max.   : 61.92   Max.   : 138.25

There are no missing values, and the ranges of all numerical values are reasonable.

To mitigate the impact of distance due to unit discrepancies, we will first preprocess the data. We’ll use the scale() function in R to transform all columns, setting their means to 0 and standard deviations to 1.

We’ll extract all columns except the first one, which represents the countries, and apply the scale() function. Then, we’ll use the head() function to view the updated data frame.

vars.to.use <- colnames(df)[-1] 
scaled_df <- scale(df[,vars.to.use]) 
head(scaled_df)
##       Population Gross.tertiary.education.enrollment.... Unemployment.rate
## [1,]  4.52338317                              -1.0932714        -0.3007812
## [2,]  0.75558519                               1.3342759         2.1487417
## [3,]  0.02243169                               0.3244808        -1.1059242
## [4,]  0.08829138                               1.0798075        -0.5027226
## [5,] -0.24801010                               1.5806659        -0.6181177
## [6,] -0.19311673                               0.1952270        -0.6967962
##      Urban_population   Latitude  Longitude
## [1,]       2.88159445 -0.1452756  0.8016324
## [2,]       1.44282075  0.5138341 -1.4456142
## [3,]       0.33067912  0.4784579  1.5644134
## [4,]       0.27252653  1.4900703  1.1407072
## [5,]      -0.19835940  0.4665887  1.4295084
## [6,]      -0.09925512  1.2445123 -0.2584490

Using hierarchical cluster analysis.

We will begin by performing hierarchical clustering analysis using the Euclidean distance and the “ward.D2” linkage method. To find the best model, we can explore various options by changing these two parameters.

For the dist() function, we can consider the following methods for distance calculation: “euclidean,” “manhattan,” “maximum,” “canberra,” “binary,” and “minkowski.”

For the hclust() function, we can explore various linkage methods, including “complete,” “single,” “average,” “ward.D,” “ward.D2,” and “centroid.”

Let’s proceed with hierarchical clustering using the Euclidean distance and “ward.D2” linkage method. If you wish to explore different combinations, please specify your preferences.

d <- dist(scaled_df, method="euclidean")
pfit <- hclust(d, method="ward.D2")

To visualize the hierarchical clustering, we will create a dendrogram. We will proceed with ‘k = 5’ clusters for now. We can explore other values of ‘k’ later if needed.

Let’s create the dendrogram with ‘k = 5’ clusters:

plot(pfit, labels=df$Country, main="Cluster Dendrogram for Country", cex = 0.7)
rect.hclust(pfit, k=5)

To gain a clearer understanding of the characteristics of each group’s members, we’ll use the cutree() function to extract the group assignments for each member.

groups <- cutree(pfit, k=5)

Let’s create a function to print the feature values for each group:

print_clusters <- function(df, groups, cols_to_print) {
Ngroups <- max(groups)
  for (i in 1:Ngroups) {
    print(paste("cluster", i))
    print(df[groups == i, cols_to_print])
  }
}

Call the function and select the features you want to analyze:

cols_to_print <- c("Country", "Population", "Gross.tertiary.education.enrollment....", "Unemployment.rate", "Urban_population", "Latitude", "Longitude")
print_clusters(df, groups, cols_to_print)
## [1] "cluster 1"
##    Country Population Gross.tertiary.education.enrollment.... Unemployment.rate
## 1    India 1366417754                                    28.1              5.36
## 40   China 1397715000                                    50.6              4.32
##    Urban_population Latitude Longitude
## 1         471031528 20.59368  78.96288
## 40        842933962 35.86166 104.19540
## [1] "cluster 2"
##          Country Population Gross.tertiary.education.enrollment....
## 2  United States  328239523                                    88.2
## 8         Brazil  212559417                                    51.3
## 9      Argentina   44938712                                    90.0
## 10         Chile   18952038                                    88.5
## 11          Cuba   11333483                                    41.4
## 12   El Salvador    6453553                                    29.4
## 16      Colombia   50339443                                    55.3
## 17      Barbados     287025                                    65.4
## 18        Mexico  126014024                                    40.2
## 20         Spain   47076781                                    88.9
## 24     Venezuela   28515829                                    79.3
## 43       Ecuador   17373662                                    44.9
## 45          Peru   32510453                                    70.7
## 48         Samoa     202506                                     7.6
##    Unemployment.rate Urban_population   Latitude  Longitude
## 2              14.70        270663028  37.090240  -95.71289
## 8              12.08        183241641 -14.235004  -51.92528
## 9               9.79         41339571 -38.416097  -63.61667
## 10              7.09         16610135 -35.675147  -71.54297
## 11              1.64          8739135  21.521757  -77.78117
## 12              4.11          4694702  13.794185  -88.89653
## 16              9.71         40827302   4.570868  -74.29733
## 17             10.33            89431  13.193887  -59.54320
## 18              3.42        102626859  23.634501 -102.55278
## 20             13.96         37927409  40.463667   -3.74922
## 24              8.80         25162368   6.423750  -66.58973
## 43              3.97         11116711  -1.831239  -78.18341
## 45              3.31         25390339  -9.189967  -75.01515
## 48              8.36            35588 -13.759029 -172.10463
## [1] "cluster 3"
##                 Country Population Gross.tertiary.education.enrollment....
## 3                 Japan  126226568                                    63.2
## 4                Russia  144373535                                    81.9
## 5           South Korea   51709098                                    94.3
## 13             Pakistan  216565318                                     9.0
## 14          Philippines  108116615                                    35.5
## 15             Thailand   69625582                                    49.3
## 19 United Arab Emirates    9770529                                    36.8
## 22            Indonesia  270203917                                    36.3
## 25               Kuwait    4207083                                    54.4
## 28            Singapore    5703569                                    84.8
## 29            Australia   25766605                                    68.0
## 38              Vietnam   96462106                                    28.5
## 39             Malaysia   32447385                                    45.1
## 46           Bangladesh  167310838                                    20.6
##    Unemployment.rate Urban_population   Latitude Longitude
## 3               2.29        115782416  36.204824 138.25292
## 4               4.59        107683889  61.524010 105.31876
## 5               4.15         42106719  35.907757 127.76692
## 13              4.45         79927762  30.375321  69.34512
## 14              2.15         50975903  12.879721 121.77402
## 15              0.75         35294600  15.870032 100.99254
## 19              2.35          8479744  23.424076  53.84782
## 22              4.69        151509724  -0.789275 113.92133
## 25              2.18          4207083  29.311660  47.48177
## 28              4.11          5703569   1.352083 103.81984
## 29              5.27         21844756 -25.274398 133.77514
## 38              2.01         35332140  14.058324 108.27720
## 39              3.32         24475766   4.210484 101.97577
## 46              4.19         60987417  23.684994  90.35633
## [1] "cluster 4"
##           Country Population Gross.tertiary.education.enrollment....
## 6  United Kingdom   66834405                                    60.0
## 7          Canada   36991981                                    68.9
## 21   Saudi Arabia   34268528                                    68.0
## 27    Netherlands   17332850                                    85.0
## 30          Italy   60297396                                    61.9
## 31        Germany   83132799                                    70.2
## 32         France   67059887                                    65.6
## 33         Sweden   10285453                                    67.0
## 35        Ukraine   44385155                                    82.7
## 36         Latvia    1912789                                    88.1
## 37    Switzerland    8574832                                    59.6
## 47        Finland    5520314                                    88.2
##    Unemployment.rate Urban_population Latitude   Longitude
## 6               3.85         55908316 55.37805   -3.435973
## 7               5.56         30628482 56.13037 -106.346771
## 21              5.93         28807838 23.88594   45.079162
## 27              3.20         15924729 52.13263    5.291266
## 30              9.89         42651966 41.87194   12.567380
## 31              3.04         64324835 51.16569   10.451526
## 32              8.43         54123364 46.22764    2.213749
## 33              6.48          9021165 60.12816   18.643501
## 35              8.88         30835699 48.37943   31.165580
## 36              6.52          1304943 56.87964   24.603189
## 37              4.58          6332428 46.81819    8.227512
## 47              6.59          4716888 61.92411   25.748151
## [1] "cluster 5"
##        Country Population Gross.tertiary.education.enrollment....
## 23      Turkey   83429615                                    23.9
## 26      Jordan   10101694                                    34.4
## 34 Afghanistan   38041754                                     9.7
## 41        Iraq   39309783                                    16.2
## 42       Egypt  100388073                                    35.2
## 44     Morocco   36910560                                    35.9
##    Unemployment.rate Urban_population Latitude Longitude
## 23             13.49         63097818 38.96375  35.24332
## 26             14.72          9213048 30.58516  36.23841
## 34             11.12          9797273 33.93911  67.70995
## 41             12.82         27783368 33.22319  43.67929
## 42             10.76         42895824 26.82055  30.80250
## 44              9.02         22975026 31.79170  -7.09262

A preliminary analysis of these clusters reveals the following:

Cluster 1: Includes India and China, both densely populated countries with a significant urban population. These two countries are located in East Asia.

Cluster 3: This cluster includes countries like Japan, Russia, South Korea, Pakistan, and the Philippines. The unemployment rate is relatively low in these countries. They are distributed across Asia and the Russian region.

Cluster 4: This cluster comprises countries such as the United Kingdom, Canada, Saudi Arabia, the Netherlands, Italy, and Germany. This cluster has a high gross tertiary education enrollment rate. These countries are primarily located in Europe and the Middle East.

Cluster 5: This cluster includes countries like Turkey, Jordan, Afghanistan, Iraq, Egypt, and Morocco. The gross tertiary education enrollment rate is relatively low, and the unemployment rate is high in these countries. They are primarily located in the Middle East and North Africa.

Cluster 2: The characteristics of this cluster are not very distinct.

These findings provide an initial understanding of the clusters and their distinguishing features. Further analysis and interpretation can be performed based on these observations.

Performance Evaluation for Different Values of ‘k’

In order to find the optimal value for performance, we create a function to calculate the CH index and WSS values of the model and generate plots of k against these two parameters.

sqr_euDist <- function(x, y) {
sum((x - y) ^ 2)
}

# Function to calculate WSS of a cluster
wss <- function(clustermat) {
c0 <- colMeans(clustermat)
sum(apply( clustermat, 1, FUN=function(row) {sqr_euDist(row, c0)} ))
}

# Function to calculate the total WSS
wss_total <- function(scaled_df, labels) {
wss.sum <- 0
k <- length(unique(labels))
for (i in 1:k)
wss.sum <- wss.sum + wss(subset(scaled_df, labels == i))
wss.sum
}

# Function to calculate total sum of squared (TSS) distance of data points about the (global) mean
tss <- function(scaled_df) {
wss(scaled_df)
}

CH_index <- function(scaled_df, kmax, method="kclust") {
npts <- nrow(scaled_df)
wss.value <- numeric(kmax) 

wss.value[1] <- wss(scaled_df)

d <- dist(scaled_df, method="euclidean")
pfit <- hclust(d, method="ward.D2")
for (k in 2:kmax) {
labels <- cutree(pfit, k=k)
wss.value[k] <- wss_total(scaled_df, labels)
}
bss.value <- tss(scaled_df) - wss.value 
B <- bss.value / (0:(kmax-1))
W <- wss.value / (npts - 1:kmax)
data.frame(k = 1:kmax, CH_index = B/W, WSS = wss.value)
}

# calculate the CH criterion
crit.df <- CH_index(scaled_df, 10, method="hclust")
fig1 <- ggplot(crit.df, aes(x=k, y=CH_index)) +
geom_point() + geom_line(colour="red") +
scale_x_continuous(breaks=1:10, labels=1:10) +
labs(y="CH index") + theme(text=element_text(size=20))
fig2 <- ggplot(crit.df, aes(x=k, y=WSS), color="blue") +
geom_point() + geom_line(colour="blue") +
scale_x_continuous(breaks=1:10, labels=1:10) +
theme(text=element_text(size=20))

grid.arrange(fig1, fig2, nrow=1)

We aim for a higher CH index value and a lower WSS value. The hope is that the rate at which the WSS decreases will slow down for k values beyond the optimal number of clusters. Therefore, for the CH index, the best performance is achieved when it reaches its maximum value, which in the current situation is k = 8. As for the WSS, the “elbow” point on the graph is the preferred choice, although there is no clear elbow in the current scenario. So, we select k = 8 as the optimal k value.

plot(pfit, labels=df$Country, main="Cluster Dendrogram for Country")
rect.hclust(pfit, k=8)

Stability Assessment

After selecting the optimal k value, we conduct a stability assessment of the algorithm using clusterboot().

kbest.p <- 8
cboot.hclust <- clusterboot(scaled_df, clustermethod=hclustCBI,
method="ward.D2", k=kbest.p)

Review the execution results:

summary(cboot.hclust$result)
##               Length Class  Mode     
## result         7     hclust list     
## noise          1     -none- logical  
## nc             1     -none- numeric  
## clusterlist    8     -none- list     
## partition     48     -none- numeric  
## clustermethod  1     -none- character
## nccl           1     -none- numeric

And we alse counted how many times each cluster was dissolved:

# cboot.hclust$bootbrd = number of times a cluster is desolved.
(values <- 1 - cboot.hclust$bootbrd/100) 
## [1] 0.89 0.42 0.57 0.95 0.77 0.88 0.93 0.96
# larger values here means higher stablility
cat("So clusters", order(values)[5], "and", order(values)[4], "are highly stable")
## So clusters 1 and 6 are highly stable
(mean(values))
## [1] 0.79625

The model’s average values are 0.79625.

Method Selection

We can optimize the model and explore problems by changing the algorithm’s Distance Method and Hierarchical Clustering Method. Different Distance Methods are suitable for different problems, and we can find the most suitable method for this problem through experimentation. For example, selecting “manhattan”:

d <- dist(scaled_df, method = "manhattan")
    pfit <- hclust(d, method = "ward.D2")

View the CH index and WSS values:

CH_index <- function(scaled_df, kmax, method="kclust") {
npts <- nrow(scaled_df)
wss.value <- numeric(kmax) 

wss.value[1] <- wss(scaled_df)

d <- dist(scaled_df, method = "manhattan")
pfit <- hclust(d, method = "ward.D2")
for (k in 2:kmax) {
labels <- cutree(pfit, k=k)
wss.value[k] <- wss_total(scaled_df, labels)
}
bss.value <- tss(scaled_df) - wss.value 
B <- bss.value / (0:(kmax-1))
W <- wss.value / (npts - 1:kmax)
data.frame(k = 1:kmax, CH_index = B/W, WSS = wss.value)
}

# calculate the CH criterion
crit.df <- CH_index(scaled_df, 10, method="hclust")
fig1 <- ggplot(crit.df, aes(x=k, y=CH_index)) +
geom_point() + geom_line(colour="red") +
scale_x_continuous(breaks=1:10, labels=1:10) +
labs(y="CH index") + theme(text=element_text(size=20))
fig2 <- ggplot(crit.df, aes(x=k, y=WSS), color="blue") +
geom_point() + geom_line(colour="blue") +
scale_x_continuous(breaks=1:10, labels=1:10) +
theme(text=element_text(size=20))

grid.arrange(fig1, fig2, nrow=1)

We will find that changing the distance method will affect the optimal k value of the model. Therefore, under different methods, we need to use different numbers of clusters for clustering. At this point, the optimal k value is 6. Check for stability:

kbest.p <- 6
cboot.hclust <- clusterboot(scaled_df, clustermethod=hclustCBI,
method="ward.D2", k=kbest.p)

Review the results:

(values <- 1 - cboot.hclust$bootbrd/100) 
## [1] 0.93 0.56 0.98 0.95 0.76 0.91
(mean(values))
## [1] 0.8483333

The mean value is 0.848333. The stability has increased with the “manhattan” model.

Now, let’s try different hierarchical clustering methods, such as selecting “complete”:

d <- dist(scaled_df, method = "euclidean")
pfit <- hclust(d, method = "complete")

View CH index and WSS values:

CH_index <- function(scaled_df, kmax, method="kclust") {
npts <- nrow(scaled_df)
wss.value <- numeric(kmax) 

wss.value[1] <- wss(scaled_df)

d <- dist(scaled_df, method = "euclidean")
pfit <- hclust(d, method = "complete")
for (k in 2:kmax) {
labels <- cutree(pfit, k=k)
wss.value[k] <- wss_total(scaled_df, labels)
}
bss.value <- tss(scaled_df) - wss.value 
B <- bss.value / (0:(kmax-1))
W <- wss.value / (npts - 1:kmax)
data.frame(k = 1:kmax, CH_index = B/W, WSS = wss.value)
}

# calculate the CH criterion
crit.df <- CH_index(scaled_df, 10, method="hclust")
fig1 <- ggplot(crit.df, aes(x=k, y=CH_index)) +
geom_point() + geom_line(colour="red") +
scale_x_continuous(breaks=1:10, labels=1:10) +
labs(y="CH index") + theme(text=element_text(size=20))
fig2 <- ggplot(crit.df, aes(x=k, y=WSS), color="blue") +
geom_point() + geom_line(colour="blue") +
scale_x_continuous(breaks=1:10, labels=1:10) +
theme(text=element_text(size=20))

grid.arrange(fig1, fig2, nrow=1)

Select 7 as the optimal k value:

kbest.p <- 7
cboot.hclust <- clusterboot(scaled_df, clustermethod=hclustCBI,
method="complete", k=kbest.p)
(values <- 1 - cboot.hclust$bootbrd/100) 
## [1] 0.90 0.49 0.93 0.67 0.71 0.88 0.90
(mean(values))
## [1] 0.7828571

The mean is 0.7828571. Under the same distance method, this model’s stability is not as good as when using “ward.D2”.

Data Visualization

Finally, we proceed with data visualization for different values of ‘k’ by “euclidean” dist method and “ward.D2” hclust method.

d <- dist(scaled_df, method="euclidean")
pfit <- hclust(d, method="ward.D2")

We will use the ‘procomp()’ function to perform Principal Component Analysis (PCA) and reduce the data to two dimensions for ease of plotting.

princ <- prcomp(scaled_df)
# focus on the first two principal components
nComp <- 2

First, define a function to find the convex hull:

find_convex_hull <- function(proj2Ddf, groups) {
  do.call(rbind,
          lapply(unique(groups),
                FUN = function(c) {
                  f <- subset(proj2Ddf, cluster==c);
                  f[chull(f),]
                }
              )
          )
}

Using a loop, let’s create plots for different values of ‘k’:

fig <- c()
kvalues <- seq(5,8)
for (i in kvalues) {
  groups <- cutree(pfit, k = i)
  # project scaled_df onto the first 2 principal components to form a new 2-column data frame.
  project2D <- as.data.frame(predict(princ, newdata=scaled_df)[,1:nComp])
  # combine with `groups` and df$Country to form a 4-column data frame
  hclust.project2D <- cbind(project2D, cluster=as.factor(groups), country=df$Country)
  hclust.hull <- find_convex_hull(hclust.project2D, groups)
  assign(paste0("fig", i),
    ggplot(hclust.project2D, aes(x=PC1, y=PC2)) +
    geom_point(aes(shape=cluster, color=cluster)) +
    geom_polygon(data=hclust.hull, aes(group=cluster, fill=as.factor(cluster)),
    alpha=0.4, linetype=0) +
    scale_x_continuous(limits = c(-0.1, 0.07)) +
    scale_y_continuous(limits = c(-0.3, 0.4)) +
    labs(title = sprintf("k = %d", i)) +
    theme(legend.position="none", text=element_text(size=10))
  )
}

grid.arrange(fig5, fig6, fig7, fig8, ncol=2, heights = c(10, 10), widths = c(10, 10))

From this visualization, we can observe that regardless of the value of ‘k,’ there is significant overlap between each cluster. This indicates a high degree of similarity between different clusters, making it challenging to distinguish them distinctly. This might be due to the choice of features or distance metrics not effectively separating these clusters, or it could be related to the data not being sufficiently dispersed.

7 Reference

In the data analysis process, some syntax generated by Chart GPT was referenced, replacing some of the variables within it.

8 Shinny app

The code can be seen and executed in the R file. ## link https://youtu.be/BQxrAuu6M94